# toy demonstration: find the best single split of a 2-D two-class
# data set by minimising the Gini impurity of the two halves
rm(list=ls())
library(mvtnorm)

# generate two Gaussian classes of 100 points each
labCount <- 100
lab1 <- rmvnorm(n=labCount, mean=c(1, 1), sigma=diag(c(1, 1)))
lab0 <- rmvnorm(n=labCount, mean=c(2, 2), sigma=diag(c(0.5, 2)))
df <- data.frame(x=append(lab1[, 1], lab0[, 1]),
                 y=append(lab1[, 2], lab0[, 2]),
                 lab=append(rep(1, labCount), rep(0, labCount)))

plot(df$x, df$y,
     col=append(rep("red", labCount), rep("blue", labCount)),
     pch=19, xlab="Feature 1", ylab="Feature 2")

# Gini impurity of a labelled data set: sum_i p_i*(1 - p_i)
giniIdx <- function(data) {
  p1 <- sum(data$lab == 1)/length(data$lab)
  p0 <- sum(data$lab == 0)/length(data$lab)
  return(p0*(1 - p0) + p1*(1 - p1))
}
p.norm <- giniIdx

# exhaustive search over every feature and every observed value for the
# split (feature index, threshold) with the lowest average impurity
getSeparator <- function(data) {
  idx <- NA
  idx.val <- NA
  cost <- p.norm(data)
  for(i in 1:(dim(data)[2] - 1)) {
    for(i.val in unique(data[, i])) {
      #print(paste("i = ", i, "; v = ", i.val, sep=""))
      cost.tmp <- 0.5*(p.norm(data[data[, i] < i.val, ]) +
                       p.norm(data[data[, i] >= i.val, ]))
      if(is.nan(cost.tmp)) {
        next
      }
      if(cost.tmp < cost) {
        cost <- cost.tmp
        idx <- i
        idx.val <- i.val
      }
    }
  }
  return(c(idx, idx.val))
}

s1 <- getSeparator(df)
# draw the threshold: a vertical line if the split is on feature 1,
# a horizontal one if it is on feature 2
if(s1[1] == 1) {
  abline(v=s1[2], lty=2, lwd=2)
} else {
  abline(h=s1[2], lty=2, lwd=2)
}
// Generic tree node: holds a data payload, a reference to its parent
// and a linked list of child nodes.
public class TreeNode<T>
{
    public TreeNode()
    {
        Childs = new LinkedList<TreeNode<T>>();
    }

    public TreeNode(T data)
    {
        Data = data;
        Childs = new LinkedList<TreeNode<T>>();
    }

    public TreeNode<T> Parent { get; set; }

    public LinkedList<TreeNode<T>> Childs { get; set; }

    public T Data { get; set; }

    public virtual bool AddChild(T data)
    {
        TreeNode<T> node = new TreeNode<T>() { Data = data };
        node.Parent = this;
        Childs.AddLast(node);
        return true;
    }

    public virtual bool AddChild(TreeNode<T> node)
    {
        node.Parent = this;
        Childs.AddLast(node);
        return true;
    }

    // a node is a leaf if it has no children
    public bool IsLeaf
    {
        get { return Childs.Count == 0; }
    }

    // depth = number of edges on the path up to the root
    public int Depth
    {
        get
        {
            int d = 0;
            TreeNode<T> node = this;
            while (node.Parent != null)
            {
                d++;
                node = node.Parent;
            }
            return d;
        }
    }
}
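A minimal sketch of how the node behaves (the string values are purely illustrative):

// hypothetical illustration: build a three-node tree and inspect it
TreeNode<string> root = new TreeNode<string>("root");
root.AddChild("left");
root.AddChild("right");

Console.WriteLine(root.IsLeaf);                  // False
Console.WriteLine(root.Childs.First().IsLeaf);   // True
Console.WriteLine(root.Childs.First().Depth);    // 1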
// Training sample: an input feature vector and the corresponding output
// (for classification the class label is stored in Output[0]).
public class DataItem<T>
{
    private T[] _input = null;
    private T[] _output = null;

    public DataItem()
    {
    }

    public DataItem(T[] input, T[] output)
    {
        _input = input;
        _output = output;
    }

    public T[] Input
    {
        get { return _input; }
        set { _input = value; }
    }

    public T[] Output
    {
        get { return _output; }
        set { _output = value; }
    }
}
// Payload of a classification tree node.
public class ClassificationTreeNodeData
{
    // map <class id, probability p of that class among the node's data>
    internal IDictionary<double, double> Probabilities { get; set; }

    // value of the impurity norm for this node; used while training
    internal double Cost { get; set; }

    // split predicate: true sends a vector to the left child, false to
    // the right one (a delegate cannot be serialized, so it is rebuilt
    // from FeatureIndex and FeatureValue below)
    internal Predicate<double[]> Predicate { get; set; }

    // subset of the training data that reached this node; kept only
    // during training
    internal IList<DataItem<double>> DataSet { get; set; }

    // node caption, used for visualization
    internal string Name { get; set; }

    // index of the feature the node splits on
    internal int FeatureIndex { get; set; }

    // threshold value of that feature
    internal double FeatureValue { get; set; }

    // drop the non-serializable predicate before serialization...
    [OnSerializing]
    private void OnSerializing(StreamingContext context)
    {
        Predicate = null;
    }

    // ...and rebuild it from the stored feature index and threshold
    [OnDeserialized]
    [OnSerialized]
    private void OnDeserialized(StreamingContext context)
    {
        Predicate = v => v[FeatureIndex] < FeatureValue;
    }
}
// Binary classification tree trained by greedy recursive splitting.
public class ClassificationBinaryTree
{
    private TreeNode<ClassificationTreeNodeData> _rootNode = null;
    private INorm<double> _norm = null;           // impurity norm (Gini index, entropy, ...)
    private int _minLeafDataCount = 1;            // minimum number of data items in a leaf
    private int[] _trainingFeaturesSubset = null; // fixed subset of features to train on (null = all)
    private int _randomSubsetSize = 0;            // size of the random feature subset drawn at every node, 0 = off
    private Random _random = null;
    private double _maxProbability = 0;           // stop splitting once the dominant class exceeds this probability
    private int _maxDepth = Int32.MaxValue;       // maximum tree depth
    private bool _showLog = false;
    private int _featuresCount = 0;

    public ClassificationBinaryTree(INorm<double> norm, int minLeafDataCount,
                                    int[] trainingFeaturesSubset = null, int randomSubsetSize = 0,
                                    double maxProbability = 0.95, int maxDepth = Int32.MaxValue,
                                    bool showLog = false)
    {
        _norm = norm;
        _minLeafDataCount = minLeafDataCount;
        _trainingFeaturesSubset = trainingFeaturesSubset;
        _randomSubsetSize = randomSubsetSize;
        _maxProbability = maxProbability;
        _maxDepth = maxDepth;
        _showLog = showLog;
    }

    public TreeNode<ClassificationTreeNodeData> RootNode
    {
        get { return _rootNode; }
    }

    // breadth-first training: keep splitting nodes until the stop criteria are met
    public void Train(IList<DataItem<double>> data)
    {
        _featuresCount = data.First().Input.Length;
        if (_randomSubsetSize > 0)
        {
            _random = new Random(Helper.GetSeed());
        }
        IDictionary<double, double> rootProbs = ComputeLabelsProbabilities(data);
        _rootNode = new TreeNode<ClassificationTreeNodeData>(new ClassificationTreeNodeData()
        {
            DataSet = data,
            Probabilities = rootProbs,
            Cost = _norm.Calculate(rootProbs.Select(x => x.Value).ToArray())
        });

        // nodes waiting to be split
        Queue<TreeNode<ClassificationTreeNodeData>> queue = new Queue<TreeNode<ClassificationTreeNodeData>>();
        queue.Enqueue(_rootNode);
        while (queue.Count > 0)
        {
            if (_showLog)
            {
                Logger.Instance.Log("Tree training: queue size is " + queue.Count);
            }
            TreeNode<ClassificationTreeNodeData> node = queue.Dequeue();
            int sourceCount = node.Data.DataSet.Count;

            // try to split the current node
            TrainNode(node, node.Data.DataSet, _trainingFeaturesSubset, _randomSubsetSize);
            if (_showLog && node.Childs.Count() > 0)
            {
                Logger.Instance.Log("Tree training: source " + sourceCount + " is splitted into "
                                    + node.Childs.First().Data.DataSet.Count + " and "
                                    + node.Childs.Last().Data.DataSet.Count);
            }

            // enqueue the children, unless a stop criterion turns them into leaves
            foreach (TreeNode<ClassificationTreeNodeData> child in node.Childs)
            {
                if (child.Data.Probabilities.Count == 1
                    || child.Data.DataSet.Count <= _minLeafDataCount
                    || child.Data.Probabilities.First().Value > _maxProbability
                    || child.Depth >= _maxDepth)
                {
                    child.Data.DataSet = null;
                    continue;
                }
                queue.Enqueue(child);
            }
        }
    }

    // search for the best split of a single node
    private void TrainNode(TreeNode<ClassificationTreeNodeData> node, IList<DataItem<double>> data,
                           int[] featuresSubset, int randomSubsetSize)
    {
        // argmin over (feature, threshold) of the average impurity of the two halves
        double minCost = node.Data.Cost;
        int idx = -1;
        double threshold = 0;
        IDictionary<double, double> minLeftProbs = null;
        IDictionary<double, double> minRightProbs = null;
        IList<DataItem<double>> minLeft = null;
        IList<DataItem<double>> minRight = null;
        double minLeftCost = 0;
        double minRightCost = 0;

        // draw a random feature subset for this node, if requested
        if (randomSubsetSize > 0)
        {
            featuresSubset = new int[randomSubsetSize];
            IList<int> candidates = new List<int>();
            for (int i = 0; i < _featuresCount; i++)
            {
                candidates.Add(i);
            }
            for (int i = 0; i < randomSubsetSize; i++)
            {
                int idxRandom = _random.Next(0, candidates.Count);
                featuresSubset[i] = candidates[idxRandom];
                candidates.RemoveAt(idxRandom);
            }
        }
        else if (featuresSubset == null)
        {
            featuresSubset = new int[data.First().Input.Length];
            for (int i = 0; i < data.First().Input.Length; i++)
            {
                featuresSubset[i] = i;
            }
        }

        // exhaustive search over the selected features and their observed values
        foreach (int i in featuresSubset)
        {
            IList<double> domain = data.Select(x => x.Input[i]).Distinct().ToList();
            foreach (double t in domain)
            {
                IList<DataItem<double>> left = new List<DataItem<double>>();
                IList<DataItem<double>> right = new List<DataItem<double>>();
                IDictionary<double, double> leftProbs = new Dictionary<double, double>();
                IDictionary<double, double> rightProbs = new Dictionary<double, double>();
                foreach (DataItem<double> di in data)
                {
                    if (di.Input[i] < t)
                    {
                        left.Add(di);
                        if (!leftProbs.ContainsKey(di.Output[0]))
                        {
                            leftProbs.Add(di.Output[0], 0);
                        }
                        leftProbs[di.Output[0]]++;
                    }
                    else
                    {
                        right.Add(di);
                        if (!rightProbs.ContainsKey(di.Output[0]))
                        {
                            rightProbs.Add(di.Output[0], 0);
                        }
                        rightProbs[di.Output[0]]++;
                    }
                }
                if (right.Count == 0 || left.Count == 0)
                {
                    continue;
                }

                // turn the class counts into probabilities
                leftProbs = leftProbs.ToDictionary(x => x.Key, x => x.Value/left.Count);
                rightProbs = rightProbs.ToDictionary(x => x.Key, x => x.Value/right.Count);
                double leftCost = _norm.Calculate(leftProbs.Select(x => x.Value).ToArray());
                double rightCost = _norm.Calculate(rightProbs.Select(x => x.Value).ToArray());
                double avgCost = (leftCost + rightCost)/2;
                if (avgCost < minCost)
                {
                    minCost = avgCost;
                    idx = i;
                    threshold = t;
                    minLeftProbs = leftProbs;
                    minRightProbs = rightProbs;
                    minLeft = left;
                    minRight = right;
                    minLeftCost = leftCost;
                    minRightCost = rightCost;
                }
            }
        }

        // the data set is no longer needed on an inner node
        node.Data.DataSet = null;
        if (idx != -1)
        {
            // node should be splitted
            node.Data.Predicate = v => v[idx] < threshold;
            // human-readable name, used by WriteDotFile for visualization
            node.Data.Name = "x[" + idx + "] < " + threshold;
            node.Data.Probabilities = null;
            node.Data.FeatureIndex = idx;
            node.Data.FeatureValue = threshold;
            node.AddChild(new ClassificationTreeNodeData()
            {
                Probabilities = minLeftProbs.OrderByDescending(x => x.Value).ToDictionary(x => x.Key, x => x.Value),
                DataSet = minLeft,
                Cost = minLeftCost
            });
            node.AddChild(new ClassificationTreeNodeData()
            {
                Probabilities = minRightProbs.OrderByDescending(x => x.Value).ToDictionary(x => x.Key, x => x.Value),
                DataSet = minRight,
                Cost = minRightCost
            });
        }
    }

    // empirical probability of every class label in the data set
    private IDictionary<double, double> ComputeLabelsProbabilities(IList<DataItem<double>> data)
    {
        IDictionary<double, double> p = new Dictionary<double, double>();
        double denominator = data.Count;
        foreach (double label in data.Select(x => x.Output[0]).Distinct())
        {
            p.Add(label, data.Where(x => x.Output[0] == label).Count() / denominator);
        }
        return p;
    }

    // descend from the root, following the predicates, until a leaf is reached
    public IDictionary<double, double> Classify(double[] v)
    {
        TreeNode<ClassificationTreeNodeData> node = _rootNode;
        while (!node.IsLeaf)
        {
            node = node.Data.Predicate(v) ? node.Childs.First() : node.Childs.Last();
        }
        return node.Data.Probabilities;
    }

    // dump the tree in the dot format of GraphVis, http://www.graphviz.org/
    public void WriteDotFile(StreamWriter sw, bool separateTerminalNode = false)
    {
        sw.WriteLine("digraph G{");
        sw.WriteLine("graph [ordering=\"out\"];");
        Queue<TreeNode<ClassificationTreeNodeData>> q = new Queue<TreeNode<ClassificationTreeNodeData>>();
        q.Enqueue(_rootNode);
        int terminalCount = 0;
        ISet<string> styles = new HashSet<string>();
        while (q.Count > 0)
        {
            TreeNode<ClassificationTreeNodeData> node = q.Dequeue();
            foreach (TreeNode<ClassificationTreeNodeData> child in node.Childs)
            {
                string childName = child.Data.Name;
                if (String.IsNullOrEmpty(childName))
                {
                    if (separateTerminalNode)
                    {
                        childName = "TNode #" + terminalCount + "; Class: " + child.Data.Probabilities.First().Key;
                    }
                    else
                    {
                        childName = "Class: " + child.Data.Probabilities.First().Key;
                    }
                    styles.Add("\"" + childName + "\" [" + "color=red, style=filled" + "];");
                    terminalCount++;
                }
                sw.WriteLine("\"" + node.Data.Name + "\" -> " + "\"" + childName + "\";");
                q.Enqueue(child);
            }
        }
        foreach (string style in styles)
        {
            sw.WriteLine(style);
        }
        sw.WriteLine("}");
    }
}
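Before moving on to the forest, here is a minimal sketch of training a single tree. The four data points are invented for illustration, and the snippet assumes it lives in the same assembly as the tree, since GiniIndex (defined below) is declared internal:

// hypothetical toy example: 4 points, 2 features, labels 0 and 1
IList<DataItem<double>> train = new List<DataItem<double>>()
{
    new DataItem<double>(new double[] {0.1, 0.2}, new double[] {0}),
    new DataItem<double>(new double[] {0.3, 0.1}, new double[] {0}),
    new DataItem<double>(new double[] {2.0, 1.9}, new double[] {1}),
    new DataItem<double>(new double[] {2.2, 2.1}, new double[] {1})
};

ClassificationBinaryTree tree = new ClassificationBinaryTree(
    new GiniIndex(), // impurity norm
    1                // minLeafDataCount
);
tree.Train(train);

// probabilities of each class in the leaf the vector falls into
IDictionary<double, double> result = tree.Classify(new double[] {2.1, 2.0});
Console.WriteLine(result.First().Key); // expected: 1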
// A norm maps a vector of class probabilities to a single impurity value.
public interface INorm<T>
{
    double Calculate(T[] v);
}
// Gini index: sum_i p_i*(1 - p_i)
internal class GiniIndex : INorm<double>
{
    #region INorm<double> Members

    public double Calculate(double[] v)
    {
        return v.Sum(p => p*(1 - p));
    }

    #endregion
}
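A quick sanity check of the norm (illustrative numbers): a pure node has zero impurity, while a perfectly mixed two-class node reaches the maximum of 0.5.

INorm<double> gini = new GiniIndex();
Console.WriteLine(gini.Calculate(new double[] {1.0, 0.0})); // 0    (pure node)
Console.WriteLine(gini.Calculate(new double[] {0.5, 0.5})); // 0.5  (perfect mix)
Console.WriteLine(gini.Calculate(new double[] {0.9, 0.1})); // 0.18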
// Adapter that turns any metric into a norm by evaluating it on the
// vector paired with itself.
internal class MetricsBasedNorm<T> : INorm<T>
{
    private IMetrics<T> _m = null;

    internal MetricsBasedNorm(IMetrics<T> m)
    {
        _m = m;
    }

    #region INorm<T> Members

    public double Calculate(T[] v)
    {
        return _m.Calculate(v, v);
    }

    #endregion
}

public interface IMetrics<T>
{
    /// <summary>
    /// Calculate value of metrics
    /// </summary>
    double Calculate(T[] v1, T[] v2);

    /// <summary>
    /// Get centroid/clusteroid of data
    /// </summary>
    T[] GetCentroid(IList<T[]> data);

    /// <summary>
    /// Calculate value of partial derivative by v2[v2Index]
    /// </summary>
    T CalculatePartialDerivaitveByV2Index(T[] v1, T[] v2, int v2Index);
}

internal class CrossEntropy : MetricsBase<double>
{
    internal CrossEntropy()
    {
    }

    /// <summary>
    /// -\sum_i v1_i * ln(v2_i)
    /// </summary>
    public override double Calculate(double[] v1, double[] v2)
    {
        if (v1.Length != v2.Length)
        {
            throw new ArgumentException("Length of v1 and v2 should be equal");
        }
        if (v1.Length == 0 || v2.Length == 0)
        {
            throw new ArgumentException("Vector dimension can't be 0");
        }
        double d = 0;
        for (int i = 0; i < v1.Length; i++)
        {
            d += v1[i]*Math.Log(v2[i] + Double.Epsilon);
        }
        return -d;
    }

    public override double CalculatePartialDerivaitveByV2Index(double[] v1, double[] v2, int v2Index)
    {
        return v2[v2Index] - v1[v2Index];
    }
}
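Note what MetricsBasedNorm does when it wraps CrossEntropy: Calculate(v, v) evaluates -sum_i p_i*ln(p_i), i.e. the Shannon entropy of the class distribution, the second classic impurity measure next to the Gini index. A quick check, using the same factory calls that appear in the training example further down:

INorm<double> entropy = NormCreator.CreateByMetrics(MetricsCreator.CrossEntropy());
Console.WriteLine(entropy.Calculate(new double[] {1.0, 0.0})); // ~0     (pure node)
Console.WriteLine(entropy.Calculate(new double[] {0.5, 0.5})); // ~0.693 (ln 2, perfect mix)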
// Random forest: an ensemble of classification trees, each trained on a
// bootstrap sample of the data with a random feature subset per node.
public class ClassificationRandomForest
{
    // same parameters as for a single tree
    private INorm<double> _norm = null;
    private int _minLeafDataCount = 1;
    private int[] _trainingFeaturesSubset = null;
    private int _randomSubsetSize = 0; // zero if all features needed
    private double _maxProbability = 0;
    private int _maxDepth = Int32.MaxValue;
    private bool _showLog = false;
    private int _forestSize = 0; // number of trees

    private ConcurrentBag<ClassificationBinaryTree> _trees = null;

    public ClassificationRandomForest(INorm<double> norm, int forestSize, int minLeafDataCount,
                                      int[] trainingFeaturesSubset = null, int randomSubsetSize = 0,
                                      double maxProbability = 0.95, int maxDepth = Int32.MaxValue,
                                      bool showLog = false)
    {
        _norm = norm;
        _minLeafDataCount = minLeafDataCount;
        _trainingFeaturesSubset = trainingFeaturesSubset;
        _randomSubsetSize = randomSubsetSize;
        _maxProbability = maxProbability;
        _maxDepth = maxDepth;
        _forestSize = forestSize;
        _showLog = showLog;
    }

    public void Train(IList<DataItem<double>> data)
    {
        if (_showLog)
        {
            Logger.Instance.Log("Training is started");
        }

        // every tree is trained independently, on its own bootstrap sample
        _trees = new ConcurrentBag<ClassificationBinaryTree>();
        Parallel.For(0, _forestSize, i =>
        {
            ClassificationBinaryTree ct = new ClassificationBinaryTree(
                _norm,
                _minLeafDataCount,
                _trainingFeaturesSubset,
                _randomSubsetSize,
                _maxProbability,
                _maxDepth,
                false
            );
            ct.Train(BasicStatFunctions.Sample(data, data.Count, true));
            _trees.Add(ct);
            if (_showLog)
            {
                Logger.Instance.Log("Training of tree #" + _trees.Count + " is completed!");
            }
        });
    }

    // classification by majority vote over the trees (bagging)
    public IDictionary<double, double> Classify(double[] v)
    {
        IDictionary<double, double> p = new Dictionary<double, double>();
        foreach (ClassificationBinaryTree ct in _trees)
        {
            IDictionary<double, double> tr = ct.Classify(v);
            double winClass = tr.First().Key;
            if (!p.ContainsKey(winClass))
            {
                p.Add(winClass, 0);
            }
            p[winClass]++;
        }
        double denominator = p.Sum(x => x.Value);
        return p.ToDictionary(x => x.Key, x => x.Value/denominator)
                .OrderByDescending(x => x.Value)
                .ToDictionary(x => x.Key, x => x.Value);
    }

    public IList<ClassificationBinaryTree> Forest
    {
        get { return _trees.ToList(); }
    }
}
ClassificationRandomForest crf = new ClassificationRandomForest(
    NormCreator.CreateByMetrics(MetricsCreator.CrossEntropy()), // impurity norm: entropy
    10,                                                         // forest size
    1,                                                          // min data items per leaf
    null,                                                       // no fixed feature subset
    Convert.ToInt32(Math.Round(Math.Sqrt(ds.TrainSet.First().Input.Length))), // sqrt(d) random features per node
    0.95,                                                       // max probability threshold
    1000,                                                       // max depth
    true                                                        // log progress
);
crf.Train(ds.TrainSet);
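After training, Classify returns the forest's vote distribution ordered by descending share. A hypothetical call (a ds.TestSet alongside ds.TrainSet is assumed here):

// hypothetical: classify the first test vector and print the winning class
DataItem<double> test = ds.TestSet.First();
IDictionary<double, double> votes = crf.Classify(test.Input);
Console.WriteLine("Predicted class: " + votes.First().Key
                  + " (share of votes: " + votes.First().Value + ")");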
// write every tree of the forest into its own .dot file
foreach (ClassificationBinaryTree tree in crf.Forest)
{
    using (StreamWriter sw = new StreamWriter(@"e:\Neuroximator\NetworkTrainingOCR\TreeTestData\Forest\"
        + (new DirectoryInfo(@"e:\Neuroximator\NetworkTrainingOCR\TreeTestData\Forest\")).GetFiles().Count()
        + ".dot"))
    {
        tree.WriteDotFile(sw);
        sw.Close();
    }
}
The generated .dot files can then be rendered to images with the GraphViz dot tool:

dot.exe -Tpng "tree.dot" -o "tree.png"
Source: https://habr.com/ru/post/215453/