分类回归树(Classification And Regression Trees,CART)是一种构造树的监督学习方法。
和ID3决策树作比较:
1. ID3每次直接用最佳特征分割数据,即如果当前特征有4个可能值,那么数据将被分成4份,处理的是标称型数据,不能直接处理连续型数据。CART则利用二元切分来处理连续型变量,每次会找一个最佳特征的阈值,把数据集分成两部分,也就是左子树和右子树。
2. CART使用方差计算来代替香农熵。但目的都是找最佳切分特征。
import numpy as np

'''CART (Classification And Regression Trees) uses binary splits to handle
continuous variables. A regression tree is like a classification tree except
that each leaf holds a continuous value (splitting is still driven by the
attribute values, which are all floats here, so it is not truly "continuous").
Two kinds of CART are implemented below: regression trees and model trees.'''


def loadData(filename):
    """Load a tab-separated text file into a list of rows of floats."""
    dataM = []
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(filename) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            # BUG FIX: in Python 3 map() is lazy; materialize it so every
            # row really is a list of floats, not a one-shot iterator.
            fltLine = list(map(float, curLine))
            dataM.append(fltLine)
    return dataM


# ----------- regression tree: each leaf node holds a single value -----------

def regLeaf(data):
    # Leaf generator used when the data need no further split: target mean.
    return np.mean(data[:, -1])


def regErr(data):
    # Error measure: total squared deviation (variance * sample count).
    return np.var(data[:, -1]) * np.shape(data)[0]


def binSplitData(data, feature, value):
    """Binary split of `data` on `feature` at `value`: (> value, <= value)."""
    mat0 = data[np.nonzero(data[:, feature] > value)[0], :]   # left subtree
    mat1 = data[np.nonzero(data[:, feature] <= value)[0], :]  # right subtree
    return mat0, mat1


def chooseBestSplit(data, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the best (feature index, threshold) pair for splitting `data`.

    Returns (None, leaf) when splitting is not worthwhile: the targets are
    all identical, the best split reduces the error by less than tolS, or
    it would leave fewer than tolN samples on one side.
    """
    tolS = ops[0]  # minimum error reduction required to accept a split
    tolN = ops[1]  # minimum number of samples allowed on each side
    if len(set(data[:, -1].T.tolist()[0])) == 1:
        # All targets equal (a single class): nothing to gain by splitting.
        return None, leafType(data)
    m, n = np.shape(data)
    S = errType(data)  # total error of the unsplit data
    bestS = np.inf     # BUG FIX: bare `inf` was a NameError under `import numpy as np`
    bestIdx = 0
    bestVal = 0
    for featIdx in range(n - 1):
        # BUG FIX: iterating a matrix column yields unhashable 1x1 matrices;
        # flatten the column to plain floats before building the set.
        for splitVal in set(data[:, featIdx].T.tolist()[0]):
            mat0, mat1 = binSplitData(data, featIdx, splitVal)
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
                continue  # split leaves too few samples on one side
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIdx = featIdx
                bestVal = splitVal
                bestS = newS
    # BUG FIX: the stop test must use the BEST error found (`bestS`), not the
    # error of whatever candidate happened to be tried last (`newS`).
    if (S - bestS) < tolS:
        return None, leafType(data)  # error reduction too small: make a leaf
    mat0, mat1 = binSplitData(data, bestIdx, bestVal)
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(data)  # resulting subsets too small: make a leaf
    return bestIdx, bestVal


def createTree(data, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a CART as a nested dict.

    NOTE: the original file defined createTree twice (once per tree kind)
    with identical bodies, the second silently shadowing the first. One
    definition suffices: pass modelLeaf/modelErr to build a model tree.
    """
    feat, val = chooseBestSplit(data, leafType, errType, ops)
    if feat is None:  # chooseBestSplit() decided not to split any further
        return val    # val is the leaf produced by leafType()
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lfData, rtData = binSplitData(data, feat, val)
    retTree['left'] = createTree(lfData, leafType, errType, ops)
    retTree['right'] = createTree(rtData, leafType, errType, ops)
    return retTree


# ----------- model tree: each leaf node holds a linear equation -----------

def linearNode(data):
    """Fit an ordinary-least-squares line to `data`; return (w, x, y)."""
    m, n = np.shape(data)
    x = np.mat(np.ones((m, n)))   # first column stays 1.0 (intercept term)
    y = np.mat(np.ones((m, 1)))
    x[:, 1:n] = data[:, 0:n - 1]
    y = data[:, -1]
    xTx = x.T * x
    # BUG FIX: `linalg` was referenced without the `np.` prefix (NameError).
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse')
    w = xTx.I * (x.T * y)
    return w, x, y


def modelLeaf(data):
    # Leaf generator for model trees: the fitted regression coefficients.
    w, x, y = linearNode(data)
    return w


def modelErr(data):
    # Error measure for model trees: residual sum of squares of the OLS fit.
    w, x, y = linearNode(data)
    yHat = x * w
    return np.sum(np.power(y - yHat, 2))


# ----------- prediction with either kind of tree -----------

def regTreeEval(treeNode, xdata):
    # Regression-tree leaf: a plain constant value.
    return float(treeNode)


def modelTreeEval(treeNode, xdata):
    # Model-tree leaf: regression coefficients; prepend the intercept column.
    n = np.shape(xdata)[1]
    x = np.mat(np.ones((1, n + 1)))
    x[:, 1:n + 1] = xdata
    return float(x * treeNode)


def isTree(obj):
    return (type(obj).__name__ == 'dict')


def treePredict(tree, xTest, modelEval=regTreeEval):
    """Predict one sample; modelEval selects how leaves are evaluated."""
    if not isTree(tree):
        return modelEval(tree, xTest)  # the whole tree is a single leaf
    if xTest[tree['spInd']] > tree['spVal']:
        # Split feature above the threshold: descend into the left subtree.
        if isTree(tree['left']):   # left child is an internal node
            return treePredict(tree['left'], xTest, modelEval)
        else:                      # left child is already a leaf
            return modelEval(tree['left'], xTest)
    else:
        # Split feature at or below the threshold: right subtree.
        if isTree(tree['right']):
            return treePredict(tree['right'], xTest, modelEval)
        else:
            return modelEval(tree['right'], xTest)