Mon 21 Jul 22:43:21 CEST 2025
This commit is contained in:
		
							parent
							
								
									c2742ffb2c
								
							
						
					
					
						commit
						7e75307ca2
					
				
							
								
								
									
										412
									
								
								js/ml/C45.js
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										412
									
								
								js/ml/C45.js
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,412 @@ | |||
| /** | ||||
|  **      ============================== | ||||
|  **       O           O      O   OOOO | ||||
|  **       O           O     O O  O   O | ||||
|  **       O           O     O O  O   O | ||||
|  **       OOOO   OOOO O     OOO  OOOO | ||||
|  **       O   O       O    O   O O   O | ||||
|  **       O   O       O    O   O O   O | ||||
|  **       OOOO        OOOO O   O OOOO | ||||
|  **      ============================== | ||||
|  **      Dr. Stefan Bosse http://www.bsslab.de
 | ||||
|  ** | ||||
|  **      COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED | ||||
|  **                 BY THE AUTHOR(S). | ||||
|  **                 THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED, | ||||
|  **                 MODIFIED, OR OTHERWISE USED IN A CONTEXT | ||||
|  **                 OUTSIDE OF THE SOFTWARE SYSTEM. | ||||
|  ** | ||||
|  **    $AUTHORS:     ?, Stefan Bosse | ||||
|  **    $INITIAL:     (C) ? | ||||
|  **    $MODIFIED:    (C) 2006-2018 bLAB by sbosse | ||||
|  **    $VERSION:     1.1.6 | ||||
|  ** | ||||
|  **    $INFO: | ||||
|  ** | ||||
|  ** C45 Decision Tree ML Algorithm | ||||
|  ** | ||||
|  ** Portable model | ||||
|  ** | ||||
|  **    $ENDOFINFO | ||||
|  */ | ||||
| 'use strict'; | ||||
| var Io = Require('com/io'); | ||||
| var Comp = Require('com/compat'); | ||||
| var current=none; | ||||
| var Aios=none; | ||||
| 
 | ||||
// Tags stored in the `type` field of every node of the generated tree.
var NODE_TYPES = {};
// Leaf node carrying the predicted class in `value`
NODE_TYPES.RESULT = 'result';
// Number value node (cut split)
NODE_TYPES.FEATURE_NUMBER = 'feature_number';
// Category value (one branch below a split node)
NODE_TYPES.FEATURE_VALUE = 'feature_value';
// Symbolic variable node (split)
NODE_TYPES.FEATURE_CATEGORY = 'feature_category';
| 
 | ||||
/**
 * Return the distinct entries of an array-like, in first-seen order.
 *
 * Entries are compared by their property-key (string) form, so 1 and '1'
 * count as the same value and the first one seen is kept.
 *
 * @param {array} col - values to de-duplicate
 * @returns {array} new array holding the first occurrence of each value
 */
function unique(col) {
  // A prototype-less map: values such as 'hasOwnProperty' or '__proto__'
  // must be plain keys and must not collide with inherited members.
  var seen = Object.create(null), a = [];
  for (var i = 0, l = col.length; i < l; ++i) {
    if (col[i] in seen) {
      continue;
    }
    a.push(col[i]);
    seen[col[i]] = 1;
  }
  return a;
}
| 
 | ||||
/**
 * Return the first item of an array for which the predicate is truthy.
 *
 * @param {array} col - items to search
 * @param {function} pred - called with each item until it returns a truthy value
 * @returns {*} the first matching item, or undefined when nothing matches
 */
function find(col, pred) {
  var value;
  // some() stops iterating as soon as the callback returns true
  col.some(function(item) {
    if (pred(item)) {
      value = item;
      return true;
    }
    return false;
  });
  return value;
}
| 
 | ||||
/**
 * Pick the element of an array whose score is the largest.
 *
 * @param {array} array - candidates
 * @param {function} fn - maps a candidate to its numeric score
 * @returns {*} the best-scoring element (the last one on ties), or
 *              -Infinity when no element has a score >= -Infinity
 *              (empty array, or only NaN scores)
 */
function max(array, fn) {
  var bestScore = -Infinity;
  var bestPos = -1;
  var pos = 0;
  var score;
  while (pos < array.length) {
    score = fn(array[pos]);
    // '>=' (not '>') so that a later element wins a tie
    if (score >= bestScore) {
      bestScore = score;
      bestPos = pos;
    }
    pos += 1;
  }
  if (bestPos === -1) {
    return bestScore;
  }
  return array[bestPos];
}
| 
 | ||||
/**
 * Sort a shallow copy of an array-like; the input itself is not modified.
 *
 * @param {array} col - array or array-like to sort
 * @param {function} fn - comparator handed to Array.prototype.sort
 * @returns {array} new, sorted array
 */
function sortBy(col, fn) {
  var copy = Array.prototype.slice.call(col);
  copy.sort(fn);
  return copy;
}
| 
 | ||||
/**
 * C4.5 decision tree learner operating on a plain (portable) model object.
 *
 * Training rows are arrays: one column per entry of model.features (same
 * order) followed by the class label as the LAST element of the row.
 */
var C45 = {
  /**
   * Create an empty, untrained model container.
   *
   * @returns {object} {features: [], targets: [], model: null}; filled by train()
   */
  create: function () {
    return {
      features: [],
      targets: [],
      model: null
    };
  },

  /**
   * train
   *
   * Builds the decision tree and stores it in model.model; also records
   * the feature names (model.features) and distinct classes (model.targets).
   *
   * @param {object} model - container returned by create(); modified in place
   * @param {object} options
   * @param {array} options.data - training data (rows as arrays, class label last)
   * @param {string} options.target - class label name (passed through; the class
   *                                  value is always read from the last row element)
   * @param {array} options.features - features names, in row column order
   * @param {array} options.featureTypes - features type (ie 'category', 'number')
   * @throws {Error} if a feature type is neither 'number' nor 'category'
   */
  train: function(model, options) {
    var data = options.data,
        target = options.target,
        features = options.features,
        featureTypes = options.featureTypes;
    featureTypes.forEach(function(f) {
      if (['number', 'category'].indexOf(f) === -1) {
        throw new Error('C4.5: Unrecognized option!');
      }
    });

    model.features = features;
    model.targets = unique(C45._labels(data));
    // model is the generated tree structure
    model.model = C45._c45(model, data, target, features, featureTypes, 0);
  },

  /**
   * Extract the class label (last element) of every row.
   *
   * @param {array} data - rows
   * @returns {array} one label per row, duplicates kept
   */
  _labels: function(data) {
    return data.map(function(d) {
      return d[d.length - 1];
    });
  },

  /**
   * Recursively build the (sub)tree for a set of rows.
   *
   * @param {object} model - model container; model.features maps names to columns
   * @param {array} data - rows reaching this node
   * @param {string} target - class label name (passed through)
   * @param {array} features - names of the features still available for splitting
   * @param {array} featureTypes - types, indexed like model.features
   * @param {number} depth - recursion depth (passed along, incremented per level)
   * @returns {object} tree node (see NODE_TYPES)
   */
  _c45: function(model, data, target, features, featureTypes, depth) {
    var labels = C45._labels(data);
    var targets = unique(labels);
    var node;

    if (!targets.length) {
      return {
        type: NODE_TYPES.RESULT,
        value: 'none data',
        name: 'none data'
      };
    }

    if (targets.length === 1) {
      return {
        type: NODE_TYPES.RESULT,
        value: targets[0],
        name: targets[0]
      };
    }

    if (!features.length) {
      // Majority vote needs the raw labels; in the de-duplicated list
      // every class occurs exactly once.
      var topTarget = C45.mostCommon(labels);
      return {
        type: NODE_TYPES.RESULT,
        value: topTarget,
        name: topTarget
      };
    }

    var bestFeatureData = C45.maxGain(model, data, target, features, featureTypes);
    var bestFeature = bestFeatureData.feature;
    var column = model.features.indexOf(bestFeature);

    // A feature is used at most once on any path from the root
    var remainingFeatures = features.slice(0);
    remainingFeatures.splice(features.indexOf(bestFeature), 1);

    if (featureTypes[column] === 'category') {
      var possibleValues = unique(data.map(function(d) {
        return d[column];
      }));
      node = {
        name: bestFeature,
        type: NODE_TYPES.FEATURE_CATEGORY,
        values: possibleValues.map(function(v) {
          var newData = data.filter(function(x) {
            return x[column] === v;
          });
          return {
            name: v,
            type: NODE_TYPES.FEATURE_VALUE,
            child: C45._c45(model, newData, target, remainingFeatures, featureTypes, depth + 1)
          };
        })
      };
    } else if (featureTypes[column] === 'number') {
      node = {
        name: bestFeature,
        type: NODE_TYPES.FEATURE_NUMBER,
        cut: bestFeatureData.cut,
        values: []
      };

      // values[0]: rows with value > cut
      var newDataRight = data.filter(function(x) {
        return parseFloat(x[column]) > bestFeatureData.cut;
      });
      node.values.push({
        name: bestFeatureData.cut.toString(),
        type: NODE_TYPES.FEATURE_VALUE,
        child: C45._c45(model, newDataRight, target, remainingFeatures, featureTypes, depth + 1)
      });

      // values[1]: rows with value <= cut
      var newDataLeft = data.filter(function(x) {
        return parseFloat(x[column]) <= bestFeatureData.cut;
      });
      node.values.push({
        name: bestFeatureData.cut.toString(),
        type: NODE_TYPES.FEATURE_VALUE,
        child: C45._c45(model, newDataLeft, target, remainingFeatures, featureTypes, depth + 1)
      });
    }
    return node;
  },

  /**
   * Classify one sample with a trained model.
   *
   * @param {object} model - trained model container
   * @param {object|array} sample - feature values, either keyed by feature name
   *        or as an array in model.features column order (training row layout)
   * @returns {*} predicted class, or 'unknown' if the sample carries a
   *              category value that has no branch in the tree
   * @throws {Error} if the model has not been trained
   */
  classify: function (model, sample) {
    // root is feature (attribute) containing all sub values
    var childNode, sampleVal;
    var root = model.model;

    // create() initialises model.model with null
    if (root === undefined || root === null) {
      throw new Error('model is undefined');
    }

    // Named lookup first; array samples fall back to the column position
    var readFeature = function (name) {
      if (Array.isArray(sample) && !Object.prototype.hasOwnProperty.call(sample, name)) {
        return sample[model.features.indexOf(name)];
      }
      return sample[name];
    };

    while (root.type != NODE_TYPES.RESULT) {
      // Never reuse the branch chosen on the previous level
      childNode = undefined;

      if (root.type == NODE_TYPES.FEATURE_NUMBER) {
        // feature number attribute
        sampleVal = parseFloat(readFeature(root.name));
        // values[1] is the '<= cut' branch, values[0] the '> cut' branch;
        // a NaN value compares false and therefore takes values[0]
        if (sampleVal <= root.cut) {
          childNode = root.values[1];
        } else {
          childNode = root.values[0];
        }
      } else if (root.type == NODE_TYPES.FEATURE_CATEGORY) {
        // feature category attribute
        sampleVal = readFeature(root.name);

        // sub value , containing n childs
        childNode = find(root.values, function(x) {
          return x.name === sampleVal;
        });
      }

      // non trained feature
      if (typeof childNode === 'undefined') {
        return 'unknown';
      }
      root = childNode.child;
    }
    return root.value;
  },

  /**
   * Weighted class entropy after splitting the rows of a numeric feature
   * into '<= cut' and '> cut'.
   *
   * @param {object} model - model container (for the feature column lookup)
   * @param {array} data - rows
   * @param {string} feature - numeric feature name
   * @param {number} cut - split threshold
   * @param {string} target - class label name (unused here)
   * @returns {number} conditional entropy in bits
   */
  conditionalEntropy: function(model, data, feature, cut, target) {
    var column = model.features.indexOf(feature);
    var subset1 = data.filter(function(x) {
      return parseFloat(x[column]) <= cut;
    });
    var subset2 = data.filter(function(x) {
      return parseFloat(x[column]) > cut;
    });
    var setSize = data.length;
    return subset1.length / setSize * C45.entropy(model, C45._labels(subset1)) +
           subset2.length / setSize * C45.entropy(model, C45._labels(subset2));
  },

  /**
   * Count the entries of targets that are strictly equal to target.
   *
   * @param {*} target - value to count
   * @param {array} targets - list to scan
   * @returns {number} number of occurrences
   */
  count: function(target, targets) {
    return targets.filter(function(t) {
      return t === target;
    }).length;
  },

  /**
   * Shannon entropy (base 2) of a list of values.
   *
   * @param {object} model - unused; kept for a uniform call signature
   * @param {array} vals - values (e.g. class labels)
   * @returns {number} entropy in bits; 0 for an empty list
   */
  entropy: function(model, vals) {
    var uniqueVals = unique(vals);
    var probs = uniqueVals.map(function(x) {
      return C45.prob(x, vals);
    });
    var logVals = probs.map(function(p) {
      return -p * C45.log2(p);
    });
    return logVals.reduce(function(a, b) {
      return a + b;
    }, 0);
  },

  /**
   * Information gain of splitting the rows on one feature.
   *
   * @param {object} model - model container (for the feature column lookup)
   * @param {array} data - rows
   * @param {string} target - class label name (passed through)
   * @param {array} features - candidate feature names (unused here)
   * @param {string} feature - feature to evaluate
   * @param {array} featureTypes - types, indexed like model.features
   * @returns {object} {feature, gain, cut}; cut is 0 for category features and
   *          the best threshold for number features. undefined for any other type.
   */
  gain: function(model, data, target, features, feature, featureTypes) {
    var column = model.features.indexOf(feature);
    var type = featureTypes[column];
    var setEntropy = C45.entropy(model, C45._labels(data));
    var attrVals;

    if (type === 'category') {
      attrVals = unique(data.map(function(d) {
        return d[column];
      }));
      var setSize = data.length;
      var entropies = attrVals.map(function(n) {
        // Rows are arrays: select by column index, not by feature name
        var subset = data.filter(function(x) {
          return x[column] === n;
        });
        return (subset.length / setSize) * C45.entropy(model, C45._labels(subset));
      });
      var sumOfEntropies = entropies.reduce(function(a, b) {
        return a + b;
      }, 0);
      return {
        feature: feature,
        gain: setEntropy - sumOfEntropies,
        cut: 0
      };
    } else if (type === 'number') {
      attrVals = unique(data.map(function(d) {
        return d[column];
      }));
      // Every observed value is tried as a threshold
      var gainVals = attrVals.map(function(cut) {
        var cutf = parseFloat(cut);
        var gain = setEntropy - C45.conditionalEntropy(model, data, feature, cutf, target);
        return {
          feature: feature,
          gain: gain,
          cut: cutf
        };
      });
      return max(gainVals, function(e) {
        return e.gain;
      });
    }
  },

  /**
   * Base-2 logarithm.
   *
   * @param {number} n
   * @returns {number} log2(n)
   */
  log2: function(n) {
    return Math.log(n) / Math.log(2);
  },

  /**
   * Evaluate gain() for every candidate feature and return the best result.
   *
   * @param {object} model - model container
   * @param {array} data - rows
   * @param {string} target - class label name (passed through)
   * @param {array} features - candidate feature names
   * @param {array} featureTypes - types, indexed like model.features
   * @returns {object} the {feature, gain, cut} record with the highest gain
   */
  maxGain: function(model, data, target, features, featureTypes) {
    var g45 = features.map(function(feature) {
      return C45.gain(model, data, target, features, feature, featureTypes);
    });
    return max(g45, function(e) {
      return e.gain;
    });
  },

  /**
   * Return the most frequent value of a list.
   *
   * @param {array} targets - values, duplicates included
   * @returns {*} value with the highest count (the first seen on ties),
   *              or undefined for an empty list
   */
  mostCommon: function(targets) {
    var best, bestCount = 0;
    unique(targets).forEach(function(t) {
      var c = C45.count(t, targets);
      if (c > bestCount) {
        best = t;
        bestCount = c;
      }
    });
    return best;
  },

  /** Print the tree
  *
  * @param {object} model - tree node to render (not the model container)
  * @param {number} [indent=0] - indentation passed to Comp.string.create
  * @returns {string} textual rendering; '' for a missing node and 'model?'
  *          for a node of unknown type
  */
  print: function (model, indent) {
    var NL = '\n',
        line = '';
    if (indent == undefined) indent = 0;
    if (!model) return '';
    var sp = function () { return Comp.string.create(indent); };
    // A branch is either a feature_value wrapper (print its child) or a
    // node that is printed directly.
    var branch = function (v) {
      if (v.type == NODE_TYPES.FEATURE_VALUE)
        return C45.print(v.child, indent + 2);
      return C45.print(v, indent + 2);
    };
    switch (model.type) {
      case NODE_TYPES.RESULT:
        return sp() + '-> ' + model.name + NL;
      case NODE_TYPES.FEATURE_CATEGORY:
        line = sp() + '$' + model.name + '?' + NL;
        Comp.array.iter(model.values, function (v) {
          line += C45.print(v, indent + 2);
        });
        return line;
      case NODE_TYPES.FEATURE_NUMBER:
        line = sp() + '$' + model.name + '>' + model.cut + '?' + NL;
        line = line + branch(model.values[0]);
        line = line + sp() + '$' + model.name + '<=' + model.cut + '?' + NL;
        // Each branch is inspected on its own type
        line = line + branch(model.values[1]);
        return line;
      case NODE_TYPES.FEATURE_VALUE:
        line = sp() + '' + model.name + NL;
        line += C45.print(model.child, indent + 2);
        return line;
    }
    return 'model?';
  },

  /**
   * Relative frequency of target within targets.
   *
   * @param {*} target - value to look for
   * @param {array} targets - list to scan
   * @returns {number} count(target) / targets.length
   */
  prob: function(target, targets) {
    return C45.count(target, targets) / targets.length;
  }

};
| 
 | ||||
// Public interface of the module.
module.exports = {};
module.exports.classify = C45.classify;
module.exports.create = C45.create;
module.exports.entropy = C45.entropy;
module.exports.log2 = C45.log2;
// print takes the model container and renders the tree stored in its `model` field
module.exports.print = function (model, indent) {
  return C45.print(model.model, indent);
};
module.exports.unique = unique;
module.exports.train = C45.train;
// Stores the given module in Aios and its `current` field in current
module.exports.current = function (aios) {
  current = aios.current;
  Aios = aios;
};
		Loading…
	
		Reference in New Issue
	
	Block a user