/// <summary>
/// A node of a binary decision tree. An inner node tests one column of an
/// observation against <see cref="Value"/> and points to a "true" and a
/// "false" child; a leaf node carries the outcome counts in <see cref="Results"/>.
/// </summary>
public class DecisionNode
{
    /// <summary>Creates an empty node; all properties keep their default values.</summary>
    public DecisionNode()
    {
    }

    /// <summary>Creates an inner (decision) node.</summary>
    /// <param name="col">Index of the column this node tests.</param>
    /// <param name="value">Value the column must match for the test to be true.</param>
    /// <param name="tb">Child followed when the test is true.</param>
    /// <param name="fb">Child followed when the test is false.</param>
    public DecisionNode(int col, object value, DecisionNode tb, DecisionNode fb)
    {
        Col = col;
        Value = value;
        Tb = tb;
        Fb = fb;
    }

    /// <summary>Creates a leaf node holding the outcome counts of its branch.</summary>
    /// <param name="results">Outcome label mapped to the number of rows with that outcome.</param>
    public DecisionNode(Dictionary<string, int> results)
    {
        Results = results;
    }

    /// <summary>Index of the column tested by this node.</summary>
    public int Col { get; set; }

    /// <summary>Value the tested column must match for the test to be true.</summary>
    public object Value { get; set; }

    /// <summary>Outcome counts of this branch; non-null only on leaf nodes.</summary>
    public Dictionary<string, int> Results { get; set; }

    /// <summary>Child node followed when the test is true.</summary>
    public DecisionNode Tb { get; set; }

    /// <summary>Child node followed when the test is false.</summary>
    public DecisionNode Fb { get; set; }
}
Col表示这个节点判断条件对应的上面表格的列的索引
Value 表示为了使判断条件为true,需要的值是多少
Tb 当此节点验证结果为true时对应的子节点
Fb 当此节点验证结果为false时对应的子节点
Results 只有叶节点这个属性不为空,表示这个分支的结果
构造决策树的函数返回一个根节点;从根节点出发,按判断条件沿着Tb或Fb逐层往下,最终可以到达叶节点并得到结果。
训练决策树
这里训练决策树的算法名为CART(Classification and Regression Trees,即分类回归树)。算法首先创建一个根节点,然后评估表中所有观测变量,从中选出最合适的变量对数据进行拆分。函数DivideSet就是用于对数据进行拆分的,它接受三个参数:第一个是数据列表,第二个是表示拆分参考列在记录中位置的索引,最后一个是参考值。函数执行完成后返回两个列表:第一个包含匹配参考值的所有记录,另一个包含不匹配参考值的所有记录。我们在类TreePredict中实现这个函数:
/// <summary>
/// Splits a data set in two on one column. Handles numeric reference values
/// (<c>int</c> / <c>float</c>, matched with <c>&gt;=</c>) and nominal values
/// (anything else, matched by comparing the <c>ToString()</c> forms).
/// </summary>
/// <param name="rows">Records to split; each record is an array of column values.</param>
/// <param name="column">Index of the column to test in every record.</param>
/// <param name="value">Reference value the column is compared against.</param>
/// <returns>
/// A tuple whose first list holds the records that match the reference value and
/// whose second list holds the records that do not. Input order is preserved.
/// </returns>
public Tuple<List<object[]>, List<object[]>> DivideSet(List<object[]> rows, int column, object value)
{
    // Pick the predicate once; the reference value is converted a single time
    // here instead of once per row inside the lambda.
    Func<object[], bool> splitFunc;
    if (value is int)
    {
        var threshold = Convert.ToInt32(value);
        splitFunc = r => Convert.ToInt32(r[column]) >= threshold;
    }
    else if (value is float)
    {
        var threshold = Convert.ToSingle(value);
        splitFunc = r => Convert.ToSingle(r[column]) >= threshold;
    }
    else
    {
        // Null-conditional access: a null cell simply fails to match a
        // non-null reference value instead of throwing.
        var target = value?.ToString();
        splitFunc = r => r[column]?.ToString() == target;
    }

    // Partition in a single pass so the predicate runs once per record.
    var set1 = new List<object[]>();
    var set2 = new List<object[]>();
    foreach (var row in rows)
    {
        (splitFunc(row) ? set1 : set2).Add(row);
    }

    return Tuple.Create(set1, set2);
}
函数中定义了名为splitFunc的lambda用于按照不同的类型对列值和参考值进行对比处理。
接着我们测试一下上面的函数,按照"是否阅读过FAQ"来对结果进行拆分:
// Split the sample data on column 2 (whether the FAQ was read) with the reference value "yes".
var treePredict = new TreePredict();
var splitSet = treePredict.DivideSet(TreePredict.MyData, 2, "yes");
// Writes the first five columns of one record, comma-separated.
Action<object[]> printRow = r => { Console.WriteLine($"{r[0]},{r[1]},{r[2]},{r[3]},{r[4]}"); };
Console.WriteLine("set1:");
splitSet.Item1.ForEach(printRow);
Console.WriteLine("set2:");
splitSet.Item2.ForEach(printRow);
/// <summary>
/// Counts how often each outcome occurs, where the outcome of a record is the
/// string form of its last column.
/// </summary>
/// <param name="rows">Records to tally; every record must have at least one column.</param>
/// <returns>Outcome label mapped to its number of occurrences; empty for empty input.</returns>
public Dictionary<string, int> UniqueCounts(List<object[]> rows)
{
    var results = new Dictionary<string, int>();
    foreach (var row in rows)
    {
        // The outcome lives in the last column.
        var outcome = row.Last().ToString();

        // TryGetValue leaves 'seen' at 0 for an unseen outcome, so one lookup
        // covers both the first occurrence and every later one.
        int seen;
        results.TryGetValue(outcome, out seen);
        results[outcome] = seen + 1;
    }

    return results;
}
/// <summary>
/// Computes the entropy of the outcome column: the sum of -p(x) * log2(p(x))
/// over every distinct outcome x, where p(x) is the share of rows with that outcome.
/// </summary>
/// <param name="rows">Records whose outcomes (last column) are measured.</param>
/// <returns>The entropy in bits; 0 when <paramref name="rows"/> is empty.</returns>
public float Entropy(List<object[]> rows)
{
    var results = UniqueCounts(rows);
    var total = (float)rows.Count;

    var ent = 0f;
    foreach (var count in results.Values)
    {
        var p = count / total;
        // log2(p) via the change-of-base formula, narrowed to float before multiplying.
        ent -= p * (float)(Math.Log(p) / Math.Log(2));
    }

    return ent;
}
/// <summary>
/// Classifies an observation by walking the tree from <paramref name="tree"/>
/// down to a leaf and returning that leaf's outcome counts.
/// </summary>
/// <param name="observation">Column values of the record to classify.</param>
/// <param name="tree">Root of the (sub)tree to walk.</param>
/// <returns>The <c>Results</c> dictionary of the leaf that was reached.</returns>
/// <exception cref="ArgumentException">
/// A column tested on the way down is null in <paramref name="observation"/>;
/// use <c>MdClassify</c> for observations with missing values.
/// </exception>
public Dictionary<string, int> Classify(object[] observation, DecisionNode tree)
{
    var node = tree;

    // Only leaf nodes carry results, so keep descending until one is reached.
    while (node.Results == null)
    {
        var v = observation[node.Col];
        if (v == null)
        {
            throw new ArgumentException(
                $"Column {node.Col} of the observation is null; use MdClassify for missing data.",
                nameof(observation));
        }

        bool matches;
        if (v is int || v is float)
        {
            // Numeric columns: both sides are compared as float.
            matches = Convert.ToSingle(v) >= Convert.ToSingle(node.Value);
        }
        else
        {
            // Nominal columns: compare the string forms.
            matches = v.ToString() == node.Value.ToString();
        }

        node = matches ? node.Tb : node.Fb;
    }

    return node.Results;
}
使用这个函数来尝试分类一条记录:
// Build a tree from the sample data, classify one observation and print the
// result serialized with JsonConvert.SerializeObject.
var predictor = new TreePredict();
var root = predictor.BuildTree(TreePredict.MyData);
var observation = new object[] {"(direct)","USA","yes",5};
var outcome = predictor.Classify(observation, root);
Console.WriteLine(JsonConvert.SerializeObject(outcome));
// Build a tree from the sample data, prune it with 0.1f and print it, then
// prune the same tree again with 1.01f and print it once more.
var predictor = new TreePredict();
var root = predictor.BuildTree(TreePredict.MyData);

predictor.Prune(root, 0.1f);
predictor.PrintTree(root);

Console.WriteLine("--------------------------");

predictor.Prune(root, 1.01f);
predictor.PrintTree(root);
/// <summary>
/// Classifies an observation that may have missing (null) columns. When the
/// column tested by a node is missing, both branches are followed and their
/// results are combined, each weighted by its share of the total count.
/// </summary>
/// <param name="observation">Column values of the record; null marks a missing value.</param>
/// <param name="tree">Root of the (sub)tree to walk.</param>
/// <returns>Outcome label mapped to its (possibly fractional) weighted count.</returns>
public Dictionary<string, float> MdClassify(object[] observation, DecisionNode tree)
{
    // Leaf: hand back its counts, widened to float so they can be weighted.
    if (tree.Results != null)
    {
        return tree.Results.ToDictionary(r => r.Key, r => (float)r.Value);
    }

    var v = observation[tree.Col];
    if (v == null)
    {
        var tr = MdClassify(observation, tree.Tb);
        var fr = MdClassify(observation, tree.Fb);

        // Weight each branch by the SUM of its counts. Values.Count would only
        // be the number of distinct outcomes, which ignores how many rows
        // actually ended up in each branch.
        var tcount = tr.Values.Sum();
        var fcount = fr.Values.Sum();
        var total = tcount + fcount;

        // Avoid 0/0 = NaN when neither branch carries any count.
        var tw = total > 0 ? tcount / total : 0f;
        var fw = total > 0 ? fcount / total : 0f;

        var result = tr.ToDictionary(trKvp => trKvp.Key, trKvp => trKvp.Value * tw);
        foreach (var frKvp in fr)
        {
            // 'existing' stays 0 for outcomes seen only in the false branch.
            float existing;
            result.TryGetValue(frKvp.Key, out existing);
            result[frKvp.Key] = existing + frKvp.Value * fw;
        }

        return result;
    }

    bool matches;
    if (v is int || v is float)
    {
        // Numeric columns: both sides are compared as float.
        matches = Convert.ToSingle(v) >= Convert.ToSingle(tree.Value);
    }
    else
    {
        // Nominal columns: compare the string forms.
        matches = v.ToString() == tree.Value.ToString();
    }

    return MdClassify(observation, matches ? tree.Tb : tree.Fb);
}
如果发现缺失的列,则会分别计算其左右分支的结果,并乘以各自的权重后合并。
最后来看看使用有列缺失的数据进行分类测试的结果:
// Build a tree from the sample data, then classify two observations that
// contain null (missing) columns and print each result.
var predictor = new TreePredict();
var root = predictor.BuildTree(TreePredict.MyData);

var weighted = predictor.MdClassify(new object[] { "google", null, "yes", null }, root);
Console.WriteLine(JsonConvert.SerializeObject(weighted));

weighted = predictor.MdClassify(new object[] { "google", "France", null, null }, root);
Console.WriteLine(JsonConvert.SerializeObject(weighted));
/// <summary>
/// Computes the population variance of the last column of the given rows,
/// i.e. the mean of the squared deviations from the mean.
/// </summary>
/// <param name="rows">Records whose last column is convertible to <c>float</c>.</param>
/// <returns>The variance; 0 when <paramref name="rows"/> is empty.</returns>
public float Variance(List<object[]> rows)
{
    if (rows.Count == 0)
    {
        return 0;
    }

    var data = rows.Select(r => Convert.ToSingle(r.Last())).ToList();
    var mean = data.Average();

    return data.Select(d => (float)Math.Pow(d - mean, 2)).Average();
}
声明: 本文只为方便我个人查阅和理解,详细的分析以及源代码请移步 原作者的博客http://chjavach.iteye.com/
// Empty class: it declares no fields, constructors, or methods.
public class Singleton {
}
/*
* 懒汉模式。注意,getInstance如果在多线程环境中调用,需要加上synchronized,否则存在线程不安全问题
*/
class LazySingleton
这个月公司安排我一个人做iOS客户端开发,由于急着用,我先发布一个版本,由于第一次发布iOS应用,期间出了不少问题,记录于此。
1、使用Application Loader 发布时报错:Communication error.please use diagnostic mode to check connectivity.you need to have outbound acc
/*
2013年3月15日15:16:24
malloc 是 memory(内存) allocate(分配)的缩写
本程序没有实际含义,只是理解使用
*/
# include <stdio.h>
# include <malloc.h>
int main(void)
{
int i = 5; //分配了4个字节 静态分配
int * p
http://wiki.sdn.sap.com/wiki/display/BOBJ/Optimize+query+with+Query+Stripping+in+Web+Intelligence
and a very straightforward video
http://www.sdn.sap.com/irj/scn/events?rid=/library/uuid/40ec3a0c-936