AI3. 决策树的生成与训练-信息熵的计算
描述
Python 3 解法, 执行用时: 796ms, 内存消耗: 524288KB, 提交时间: 2022-07-05
# -*- coding: UTF-8 -*-
from collections import Counter
from math import log

import pandas as pd

# The CSV is read without a header row; every row becomes a plain Python list.
dataSet = pd.read_csv('dataSet.csv', header=None).values.tolist()


def calcInfoEnt(dataSet):
    """Return the information entropy (base 2) of the labels in *dataSet*.

    The last element of each row is taken as that row's class label.

    Args:
        dataSet: Sequence of rows (indexable sequences); labels must be
            hashable.

    Returns:
        The entropy as a float. An empty *dataSet* yields 0.0 because the
        accumulation loop never runs.
    """
    #code start here
    numEntries = len(dataSet)  # size of the data set
    # Number of rows per class label, in first-seen order.
    labelCounts = Counter(featVec[-1] for featVec in dataSet)

    infoEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        infoEnt -= prob * log(prob, 2)
    return infoEnt
    #code end here
    # The return value infoEnt is the data set's information entropy (float).


if __name__ == '__main__':
    # Prints the information entropy of the loaded data set.
    print(calcInfoEnt(dataSet))
Python 3 解法, 执行用时: 808ms, 内存消耗: 524288KB, 提交时间: 2022-06-25
# -*- coding: UTF-8 -*-
from math import log

import pandas as pd

dataSet = pd.read_csv('dataSet.csv', header=None).values.tolist()


def calcInfoEnt(dataSet):
    """Compute the base-2 information entropy of the labels in *dataSet*.

    Each row's last element is treated as its class label.

    Args:
        dataSet: Sequence of rows; labels must be hashable.

    Returns:
        The entropy as a float (0.0 for an empty data set).
    """
    numEntres = len(dataSet)
    #code start here
    # Tally rows per label. NOTE(review): the original author's comment says
    # the exercise data ends up with 6 rows of class 0 and 9 of class 1.
    tally = {}
    for row in dataSet:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1

    # Apply the entropy formula: subtract p * log2(p) for every class.
    infoEnt = 0.0
    for occurrences in tally.values():
        share = float(occurrences) / numEntres
        infoEnt -= share * log(share, 2)
    return infoEnt
    #code end here
    # The return value infoEnt is the data set's information entropy (float).


if __name__ == '__main__':
    # Prints the information entropy of the loaded data set.
    print(calcInfoEnt(dataSet))
Python 3 解法, 执行用时: 816ms, 内存消耗: 524288KB, 提交时间: 2022-07-07
from collections import Counter

import numpy as np
import pandas as pd

# Only the last CSV column is kept, so dataSet is the array of labels itself.
dataSet = pd.read_csv('dataSet.csv', header=None).values[:, -1]


def calcInfoEnt(dataSet):
    """Return the base-2 information entropy of the values in *dataSet*.

    Unlike a row-based variant, this function counts the elements of
    *dataSet* directly; the module passes in the label column.

    Args:
        dataSet: Sized iterable of hashable label values.

    Returns:
        The entropy as a NumPy float. A data set with a single distinct
        label (or no elements) yields positive 0.0.
    """
    numEntres = len(dataSet)
    cnt = Counter(dataSet)  # occurrences of each distinct value
    probability_lst = [1.0 * occurrences / numEntres
                       for occurrences in cnt.values()]
    entropy = -np.sum([p * np.log2(p) for p in probability_lst])
    # Negating a zero sum produces -0.0, which would print as "-0.0".
    # Adding +0.0 turns it into +0.0 and leaves every other value unchanged.
    return entropy + 0.0


if __name__ == '__main__':
    print(calcInfoEnt(dataSet))
Python 3 解法, 执行用时: 816ms, 内存消耗: 524288KB, 提交时间: 2022-06-27
# -*- coding: UTF-8 -*-
from math import log

import pandas as pd

dataSet = pd.read_csv('dataSet.csv', header=None).values.tolist()


def calcInfoEnt(dataSet):
    """Return the information entropy (log base 2) of *dataSet*'s labels.

    The label of a row is its last element.

    Args:
        dataSet: Sequence of rows; labels must be hashable.

    Returns:
        The entropy as a float (0.0 for an empty data set).
    """
    numEntres = len(dataSet)
    #code start here
    # Count rows per label: increment first, create the entry on a miss.
    frequencies = {}
    for record in dataSet:
        key = record[-1]
        try:
            frequencies[key] += 1
        except KeyError:
            frequencies[key] = 1

    # Relative frequency of every label, in first-seen order.
    probabilities = [float(n) / numEntres for n in frequencies.values()]

    infoEnt = 0.0
    for p in probabilities:
        infoEnt = infoEnt - p * log(p, 2)
    return infoEnt
    #code end here
    # The return value infoEnt is the data set's information entropy (float).


if __name__ == '__main__':
    # Prints the information entropy of the loaded data set.
    print(calcInfoEnt(dataSet))
Python 3 解法, 执行用时: 827ms, 内存消耗: 524288KB, 提交时间: 2022-07-21
# -*- coding: UTF-8 -*-
from math import log

import pandas as pd

dataSet = pd.read_csv('dataSet.csv', header=None).values.tolist()


def calcInfoEnt(dataSet):
    """Calculate the base-2 information entropy over the row labels.

    The last element of every row in *dataSet* is used as its label.

    Args:
        dataSet: Sequence of rows; labels must be hashable.

    Returns:
        The entropy as a float (0.0 for an empty data set).
    """
    numEntres = len(dataSet)
    #code start here
    # Pull out the label column, then count occurrences per label.
    labels = [row[-1] for row in dataSet]
    counts = dict.fromkeys(labels, 0)  # keys keep first-seen order
    for label in labels:
        counts[label] += 1

    # Entropy formula: accumulate -p * log2(p) over the classes.
    infoEnt = 0.0
    for label in counts:
        ratio = float(counts[label]) / numEntres
        infoEnt -= ratio * log(ratio, 2)
    return infoEnt
    #code end here
    # The return value infoEnt is the data set's information entropy (float).


if __name__ == '__main__':
    # Prints the information entropy of the loaded data set.
    print(calcInfoEnt(dataSet))