Good day, reader. Random Forest is today one of the most popular and extremely effective methods for solving machine learning problems such as classification and regression. In terms of efficiency it competes with support vector machines, neural networks and boosting, although it is, of course, not without its drawbacks. At first glance the learning algorithm is extremely simple (especially compared with, say, the learning algorithm for support vector machines; anyone who finds life lacking in thrills is strongly advised to work through that one at their leisure). We will try to understand, in an accessible form, the basic ideas behind Random Forest (the binary decision tree, bootstrap aggregation or bagging, the random subspace method and decorrelation) and to see why all of this works together. Compared with its competitors the model is still fairly young: it started with a 1997 article in which the authors proposed building a single decision tree using the random subspace method for selecting features when creating new tree nodes; then came a series of papers that culminated in the publication of the canonical version of the algorithm in 2001, in which an ensemble of decision trees is built on the basis of bootstrap aggregation, or bagging. At the end, a simple, not at all clever, but extremely illustrative implementation of this model in C# will be given, along with a series of tests. By the way, in the photo on the right you can see a real random forest that grows here in the Kaliningrad region, on the Curonian Spit.








— the probability vector obtained by the above procedure from the subset of the set A consisting of those elements for which the condition f < x is satisfied. Also, do not forget that the average cost of the partition must not exceed the cost of the original set: a split that does not make the parts purer than the parent is simply not worth making. Let's now go back to the original picture, see what is really happening there, and split the original data set as described above (the criterion itself is written out right after the snippet):

rm(list=ls())
library(mvtnorm)

# generate two classes of 2D points from Gaussians with different means and covariances
labCount <- 100
lab1 <- rmvnorm(n=labCount, mean=c(1,1), sigma=diag(c(1, 1)))
lab0 <- rmvnorm(n=labCount, mean=c(2,2), sigma=diag(c(0.5, 2)))
df <- data.frame(x=append(lab1[, 1], lab0[, 1]),
                 y=append(lab1[, 2], lab0[, 2]),
                 lab=append(rep(1, labCount), rep(0, labCount)))

plot(df$x, df$y,
     col=append(rep("red", labCount), rep("blue", labCount)),
     pch=19, xlab="Feature 1", ylab="Feature 2")

# Gini index of a labelled data set
giniIdx <- function(data) {
  p1 <- sum(data$lab == 1)/length(data$lab)
  p0 <- sum(data$lab == 0)/length(data$lab)
  return(p0*(1 - p0) + p1*(1 - p1))
}
p.norm <- giniIdx

# exhaustive search for the split (feature index, threshold) with the lowest average cost
getSeparator <- function(data) {
  idx <- NA
  idx.val <- NA
  cost <- p.norm(data)
  for(i in 1:(dim(data)[2] - 1)) {
    for(i.val in unique(data[, i])) {
      #print(paste("i = ", i, "; v = ", i.val, sep=""))
      cost.tmp <- 0.5*(p.norm(data[data[, i] < i.val, ]) +
                       p.norm(data[data[, i] >= i.val, ]))
      if(is.nan(cost.tmp)) {
        next
      }
      if(cost.tmp < cost) {
        cost <- cost.tmp
        idx <- i
        idx.val <- i.val
      }
    }
  }
  return(c(idx, idx.val))
}

# draw the best split: a vertical line if feature 1 was chosen, a horizontal one if feature 2
s1 <- getSeparator(df)
if(s1[1] == 1) {
  lines(c(s1[2], s1[2]), c(-100, 100), lty=2, lwd=2, type="l")
} else {
  lines(c(-100, 100), c(s1[2], s1[2]), lty=2, lwd=2, type="l")
}
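Written out explicitly, what getSeparator searches for is the following (a sketch of the criterion; note that, exactly as in the code, the cost of a split is the plain, unweighted average of the two parts, whereas a size-weighted average is another common choice):

Gini(A) = sum_k p_k * (1 - p_k), where p_k is the share of class k in the set A;
cost(A, f, x) = ( Gini(A restricted to f < x) + Gini(A restricted to f >= x) ) / 2;
a split (f, x) is accepted only if cost(A, f, x) < Gini(A), and among all candidate pairs the one with the smallest cost wins.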










// Required namespaces for the listing below (collections, LINQ, serialization, parallelism, I/O).
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.Serialization;
using System.Threading.Tasks;

// Helper, Logger, BasicStatFunctions, MetricsBase, NormCreator and MetricsCreator are the author's
// utility classes and are not shown in this listing.

// Generic tree node; each node keeps a reference to its parent and a list of children.
public class TreeNode<T>
{
    public TreeNode()
    {
        Childs = new LinkedList<TreeNode<T>>();
    }

    public TreeNode(T data)
    {
        Data = data;
        Childs = new LinkedList<TreeNode<T>>();
    }

    public TreeNode<T> Parent { get; set; }
    public LinkedList<TreeNode<T>> Childs { get; set; }
    public T Data { get; set; }

    public virtual bool AddChild(T data)
    {
        TreeNode<T> node = new TreeNode<T>() { Data = data };
        node.Parent = this;
        Childs.AddLast(node);
        return true;
    }

    public virtual bool AddChild(TreeNode<T> node)
    {
        node.Parent = this;
        Childs.AddLast(node);
        return true;
    }

    public bool IsLeaf
    {
        get { return Childs.Count == 0; }
    }

    public int Depth
    {
        get
        {
            int d = 0;
            TreeNode<T> node = this;
            while (node.Parent != null)
            {
                d++;
                node = node.Parent;
            }
            return d;
        }
    }
}

// A single training example: an input feature vector and an output vector (here the class label).
public class DataItem<T>
{
    private T[] _input = null;
    private T[] _output = null;

    public DataItem() { }

    public DataItem(T[] input, T[] output)
    {
        _input = input;
        _output = output;
    }

    public T[] Input
    {
        get { return _input; }
        set { _input = value; }
    }

    public T[] Output
    {
        get { return _output; }
        set { _output = value; }
    }
}

// Payload of a decision tree node.
public class ClassificationTreeNodeData
{
    // <class label, probability> pairs computed from the node's data set
    internal IDictionary<double, double> Probabilities { get; set; }

    // cost of the node's data set, e.g. the Gini index of its class probabilities
    internal double Cost { get; set; }

    // split predicate; true sends the vector to the left child, false to the right one
    internal Predicate<double[]> Predicate { get; set; }

    // subset of the training data that reached this node (released once it is no longer needed)
    internal IList<DataItem<double>> DataSet { get; set; }

    // human-readable node name, used when exporting the tree to GraphViz
    internal string Name { get; set; }

    // index of the feature used for the split
    internal int FeatureIndex { get; set; }

    // threshold value of the split
    internal double FeatureValue { get; set; }

    // the predicate is a lambda and cannot be serialized, so it is dropped before
    // serialization and rebuilt from FeatureIndex/FeatureValue afterwards
    [OnSerializing]
    private void OnSerializing(StreamingContext context)
    {
        Predicate = null;
    }

    [OnDeserialized]
    [OnSerialized]
    private void OnDeserialized(StreamingContext context)
    {
        Predicate = v => v[FeatureIndex] < FeatureValue;
    }
}

// Binary classification tree built by greedy breadth-first splitting.
public class ClassificationBinaryTree
{
    private TreeNode<ClassificationTreeNodeData> _rootNode = null;
    private INorm<double> _norm = null;              // impurity measure (Gini index, cross-entropy, ...)
    private int _minLeafDataCount = 1;               // minimum number of items in a leaf
    private int[] _trainingFeaturesSubset = null;    // fixed subset of features allowed for splitting
    private int _randomSubsetSize = 0;               // size of the random feature subset drawn at every node
    private Random _random = null;                   // RNG used for drawing feature subsets
    private double _maxProbability = 0;              // stop splitting once the dominant class exceeds this probability
    private int _maxDepth = Int32.MaxValue;          // maximum tree depth
    private bool _showLog = false;
    private int _featuresCount = 0;

    public ClassificationBinaryTree(INorm<double> norm, int minLeafDataCount,
                                    int[] trainingFeaturesSubset = null, int randomSubsetSize = 0,
                                    double maxProbability = 0.95, int maxDepth = Int32.MaxValue,
                                    bool showLog = false)
    {
        _norm = norm;
        _minLeafDataCount = minLeafDataCount;
        _trainingFeaturesSubset = trainingFeaturesSubset;
        _randomSubsetSize = randomSubsetSize;
        _maxProbability = maxProbability;
        _maxDepth = maxDepth;
        _showLog = showLog;
    }

    public TreeNode<ClassificationTreeNodeData> RootNode
    {
        get { return _rootNode; }
    }

    // trains the tree on the given data set
    public void Train(IList<DataItem<double>> data)
    {
        _featuresCount = data.First().Input.Length;
        if (_randomSubsetSize > 0)
        {
            _random = new Random(Helper.GetSeed());
        }
        IDictionary<double, double> rootProbs = ComputeLabelsProbabilities(data);
        _rootNode = new TreeNode<ClassificationTreeNodeData>(new ClassificationTreeNodeData()
        {
            DataSet = data,
            Probabilities = rootProbs,
            Cost = _norm.Calculate(rootProbs.Select(x => x.Value).ToArray())
        });

        // breadth-first construction of the tree
        Queue<TreeNode<ClassificationTreeNodeData>> queue = new Queue<TreeNode<ClassificationTreeNodeData>>();
        queue.Enqueue(_rootNode);
        while (queue.Count > 0)
        {
            if (_showLog)
            {
                Logger.Instance.Log("Tree training: queue size is " + queue.Count);
            }
            TreeNode<ClassificationTreeNodeData> node = queue.Dequeue();
            int sourceCount = node.Data.DataSet.Count;

            // try to split the current node
            TrainNode(node, node.Data.DataSet, _trainingFeaturesSubset, _randomSubsetSize);
            if (_showLog && node.Childs.Count() > 0)
            {
                Logger.Instance.Log("Tree training: source " + sourceCount + " is split into "
                                    + node.Childs.First().Data.DataSet.Count + " and "
                                    + node.Childs.Last().Data.DataSet.Count);
            }

            // a child becomes a leaf if it is pure enough, too small or too deep; otherwise it is split further
            foreach (TreeNode<ClassificationTreeNodeData> child in node.Childs)
            {
                if (child.Data.Probabilities.Count == 1
                    || child.Data.DataSet.Count <= _minLeafDataCount
                    || child.Data.Probabilities.First().Value > _maxProbability
                    || child.Depth >= _maxDepth)
                {
                    child.Data.DataSet = null;
                    continue;
                }
                queue.Enqueue(child);
            }
        }
    }

    // searches for the split of the node's data with the lowest average cost
    private void TrainNode(TreeNode<ClassificationTreeNodeData> node, IList<DataItem<double>> data,
                           int[] featuresSubset, int randomSubsetSize)
    {
        // argmin of the average cost over all candidate splits
        double minCost = node.Data.Cost;
        int idx = -1;
        double threshold = 0;
        IDictionary<double, double> minLeftProbs = null;
        IDictionary<double, double> minRightProbs = null;
        IList<DataItem<double>> minLeft = null;
        IList<DataItem<double>> minRight = null;
        double minLeftCost = 0;
        double minRightCost = 0;

        // draw a random subset of features for this node (random subspace method);
        // otherwise use the fixed subset, or all features if no subset was given
        if (randomSubsetSize > 0)
        {
            featuresSubset = new int[randomSubsetSize];
            IList<int> candidates = new List<int>();
            for (int i = 0; i < _featuresCount; i++)
            {
                candidates.Add(i);
            }
            for (int i = 0; i < randomSubsetSize; i++)
            {
                int idxRandom = _random.Next(0, candidates.Count);
                featuresSubset[i] = candidates[idxRandom];
                candidates.RemoveAt(idxRandom);
            }
        }
        else if (featuresSubset == null)
        {
            featuresSubset = new int[data.First().Input.Length];
            for (int i = 0; i < data.First().Input.Length; i++)
            {
                featuresSubset[i] = i;
            }
        }

        // iterate over candidate features
        foreach (int i in featuresSubset)
        {
            IList<double> domain = data.Select(x => x.Input[i]).Distinct().ToList();

            // iterate over candidate thresholds
            foreach (double t in domain)
            {
                IList<DataItem<double>> left = new List<DataItem<double>>();               // items with x[i] < t
                IList<DataItem<double>> right = new List<DataItem<double>>();              // items with x[i] >= t
                IDictionary<double, double> leftProbs = new Dictionary<double, double>();  // class counts in the left part
                IDictionary<double, double> rightProbs = new Dictionary<double, double>(); // class counts in the right part
                foreach (DataItem<double> di in data)
                {
                    if (di.Input[i] < t)
                    {
                        left.Add(di);
                        if (!leftProbs.ContainsKey(di.Output[0]))
                        {
                            leftProbs.Add(di.Output[0], 0);
                        }
                        leftProbs[di.Output[0]]++;
                    }
                    else
                    {
                        right.Add(di);
                        if (!rightProbs.ContainsKey(di.Output[0]))
                        {
                            rightProbs.Add(di.Output[0], 0);
                        }
                        rightProbs[di.Output[0]]++;
                    }
                }
                if (right.Count == 0 || left.Count == 0)
                {
                    continue;
                }

                // turn counts into probabilities and compute the average cost of the partition
                leftProbs = leftProbs.ToDictionary(x => x.Key, x => x.Value/left.Count);
                rightProbs = rightProbs.ToDictionary(x => x.Key, x => x.Value/right.Count);
                double leftCost = _norm.Calculate(leftProbs.Select(x => x.Value).ToArray());
                double rightCost = _norm.Calculate(rightProbs.Select(x => x.Value).ToArray());
                double avgCost = (leftCost + rightCost)/2;
                if (avgCost < minCost)
                {
                    minCost = avgCost;
                    idx = i;
                    threshold = t;
                    minLeftProbs = leftProbs;
                    minRightProbs = rightProbs;
                    minLeft = left;
                    minRight = right;
                    minLeftCost = leftCost;
                    minRightCost = rightCost;
                }
            }
        }

        // the training data of this node is no longer needed
        node.Data.DataSet = null;
        if (idx != -1)
        {
            // node should be split
            node.Data.Predicate = v => v[idx] < threshold;
            node.Data.Name = "x[" + idx + "] < " + threshold;
            node.Data.Probabilities = null;
            node.Data.FeatureIndex = idx;
            node.Data.FeatureValue = threshold;
            node.AddChild(new ClassificationTreeNodeData()
            {
                Probabilities = minLeftProbs.OrderByDescending(x => x.Value).ToDictionary(x => x.Key, x => x.Value),
                DataSet = minLeft,
                Cost = minLeftCost
            });
            node.AddChild(new ClassificationTreeNodeData()
            {
                Probabilities = minRightProbs.OrderByDescending(x => x.Value).ToDictionary(x => x.Key, x => x.Value),
                DataSet = minRight,
                Cost = minRightCost
            });
        }
    }
    // empirical class probabilities of a data set
    private IDictionary<double, double> ComputeLabelsProbabilities(IList<DataItem<double>> data)
    {
        IDictionary<double, double> p = new Dictionary<double, double>();
        double denominator = data.Count;
        foreach (double label in data.Select(x => x.Output[0]).Distinct())
        {
            p.Add(label, data.Where(x => x.Output[0] == label).Count() / denominator);
        }
        return p;
    }

    // walks down the tree to a leaf and returns its class probabilities
    public IDictionary<double, double> Classify(double[] v)
    {
        TreeNode<ClassificationTreeNodeData> node = _rootNode;
        while (!node.IsLeaf)
        {
            node = node.Data.Predicate(v) ? node.Childs.First() : node.Childs.Last();
        }
        return node.Data.Probabilities;
    }

    // writes the tree in DOT format for GraphViz, http://www.graphviz.org/
    public void WriteDotFile(StreamWriter sw, bool separateTerminalNode = false)
    {
        sw.WriteLine("digraph G{");
        sw.WriteLine("graph [ordering=\"out\"];");
        Queue<TreeNode<ClassificationTreeNodeData>> q = new Queue<TreeNode<ClassificationTreeNodeData>>();
        q.Enqueue(_rootNode);
        int terminalCount = 0;
        ISet<string> styles = new HashSet<string>();
        while (q.Count > 0)
        {
            TreeNode<ClassificationTreeNodeData> node = q.Dequeue();
            foreach (TreeNode<ClassificationTreeNodeData> child in node.Childs)
            {
                string childName = child.Data.Name;
                if (String.IsNullOrEmpty(childName))
                {
                    if (separateTerminalNode)
                    {
                        childName = "TNode #" + terminalCount + "; Class: " + child.Data.Probabilities.First().Key;
                    }
                    else
                    {
                        childName = "Class: " + child.Data.Probabilities.First().Key;
                    }
                    styles.Add("\"" + childName + "\" [" + "color=red, style=filled" + "];");
                    terminalCount++;
                }
                sw.WriteLine("\"" + node.Data.Name + "\" -> " + "\"" + childName + "\";");
                q.Enqueue(child);
            }
        }
        foreach (string style in styles)
        {
            sw.WriteLine(style);
        }
        sw.WriteLine("}");
    }
}

// Impurity norm of a probability vector.
public interface INorm<T>
{
    double Calculate(T[] v);
}

internal class GiniIndex : INorm<double>
{
    #region INorm<double> Members

    public double Calculate(double[] v)
    {
        return v.Sum(p => p*(1 - p));
    }

    #endregion
}

// Norm built on top of a metric: the metric of a vector with itself
// (for cross-entropy this yields the entropy of the probability vector).
internal class MetricsBasedNorm<T> : INorm<T>
{
    private IMetrics<T> _m = null;

    internal MetricsBasedNorm(IMetrics<T> m)
    {
        _m = m;
    }

    #region INorm<T> Members

    public double Calculate(T[] v)
    {
        return _m.Calculate(v, v);
    }

    #endregion
}

public interface IMetrics<T>
{
    /// <summary>
    /// Calculate value of metrics
    /// </summary>
    double Calculate(T[] v1, T[] v2);

    /// <summary>
    /// Get centroid/clusteroid of data
    /// </summary>
    T[] GetCentroid(IList<T[]> data);

    /// <summary>
    /// Calculate value of partial derivative by v2[v2Index]
    /// </summary>
    T CalculatePartialDerivaitveByV2Index(T[] v1, T[] v2, int v2Index);
}

internal class CrossEntropy : MetricsBase<double>
{
    internal CrossEntropy()
    {
    }

    /// <summary>
    /// \sum_i v1_i * ln(v2_i)
    /// </summary>
    public override double Calculate(double[] v1, double[] v2)
    {
        if (v1.Length != v2.Length)
        {
            throw new ArgumentException("Length of v1 and v2 should be equal");
        }
        if (v1.Length == 0 || v2.Length == 0)
        {
            throw new ArgumentException("Vector dimension can't be 0");
        }
        double d = 0;
        for (int i = 0; i < v1.Length; i++)
        {
            d += v1[i]*Math.Log(v2[i] + Double.Epsilon);
        }
        return -d;
    }

    public override double CalculatePartialDerivaitveByV2Index(double[] v1, double[] v2, int v2Index)
    {
        return v2[v2Index] - v1[v2Index];
    }
}

// Random forest: an ensemble of classification trees, each trained on its own bootstrap sample.
public class ClassificationRandomForest
{
    // parameters passed to every tree of the forest
    private INorm<double> _norm = null;
    private int _minLeafDataCount = 1;
    private int[] _trainingFeaturesSubset = null;
    private int _randomSubsetSize = 0; // zero if all features needed
    private double _maxProbability = 0;
    private int _maxDepth = Int32.MaxValue;
    private bool _showLog = false;
    private int _forestSize = 0;

    // the forest itself
    private ConcurrentBag<ClassificationBinaryTree> _trees = null;

    public ClassificationRandomForest(INorm<double> norm, int forestSize, int minLeafDataCount,
                                      int[] trainingFeaturesSubset = null, int randomSubsetSize = 0,
                                      double maxProbability = 0.95, int maxDepth = Int32.MaxValue,
                                      bool showLog = false)
    {
        _norm = norm;
        _minLeafDataCount = minLeafDataCount;
        _trainingFeaturesSubset = trainingFeaturesSubset;
        _randomSubsetSize = randomSubsetSize;
        _maxProbability = maxProbability;
        _maxDepth = maxDepth;
        _forestSize = forestSize;
        _showLog = showLog;
    }

    public void Train(IList<DataItem<double>> data)
    {
        if (_showLog)
        {
            Logger.Instance.Log("Training is started");
        }

        // the trees are independent, so they are trained in parallel, each on a bootstrap sample (sampling with replacement)
        _trees = new ConcurrentBag<ClassificationBinaryTree>();
        Parallel.For(0, _forestSize, i =>
        {
            ClassificationBinaryTree ct = new ClassificationBinaryTree(
                _norm,
                _minLeafDataCount,
                _trainingFeaturesSubset,
                _randomSubsetSize,
                _maxProbability,
                _maxDepth,
                false
            );
            ct.Train(BasicStatFunctions.Sample(data, data.Count, true));
            _trees.Add(ct);
            if (_showLog)
            {
                Logger.Instance.Log("Training of tree #" + _trees.Count + " is completed!");
            }
        });
    }

    // majority vote over the trees (bagging aggregation)
    public IDictionary<double, double> Classify(double[] v)
    {
        IDictionary<double, double> p = new Dictionary<double, double>();
        foreach (ClassificationBinaryTree ct in _trees)
        {
            IDictionary<double, double> tr = ct.Classify(v);
            double winClass = tr.First().Key;
            if (!p.ContainsKey(winClass))
            {
                p.Add(winClass, 0);
            }
            p[winClass]++;
        }
        double denominator = p.Sum(x => x.Value);
        return p.ToDictionary(x => x.Key, x => x.Value/denominator)
                .OrderByDescending(x => x.Value)
                .ToDictionary(x => x.Key, x => x.Value);
    }

    public IList<ClassificationBinaryTree> Forest
    {
        get { return _trees.ToList(); }
    }
}

// Training a forest of 10 trees, drawing sqrt(number of features) random features at every node,
// and exporting every tree to a DOT file.
ClassificationRandomForest crf = new ClassificationRandomForest(
    NormCreator.CreateByMetrics(MetricsCreator.CrossEntropy()),
    10,
    1,
    null,
    Convert.ToInt32(Math.Round(Math.Sqrt(ds.TrainSet.First().Input.Length))),
    0.95,
    1000,
    true
);
crf.Train(ds.TrainSet);

foreach (ClassificationBinaryTree tree in crf.Forest)
{
    using (StreamWriter sw = new StreamWriter(@"e:\Neuroximator\NetworkTrainingOCR\TreeTestData\Forest\"
        + (new DirectoryInfo(@"e:\Neuroximator\NetworkTrainingOCR\TreeTestData\Forest\")).GetFiles().Count() + ".dot"))
    {
        tree.WriteDotFile(sw);
        sw.Close();
    }
}

dot.exe -Tpng "tree.dot" -o "tree.png"
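For completeness, here is a minimal sketch of how the trained forest can be queried; the test vector is taken from a hypothetical ds.TestSet, since only ds.TrainSet appears above, and any double[] of the right dimensionality would do just as well.

// Hypothetical usage: classify one feature vector with the forest trained above.
// ds.TestSet is an assumption; it is not part of the listing above.
double[] sample = ds.TestSet.First().Input;
IDictionary<double, double> votes = crf.Classify(sample);
// Classify returns vote shares ordered by descending value, so the first key is the predicted class.
Console.WriteLine("Predicted class: " + votes.First().Key + " (vote share: " + votes.First().Value + ")");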


Source: https://habr.com/ru/post/215453/