From 0a9cdd35c48631a69a962113a03ccc5f44af5a13 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Wed, 20 Jun 2018 11:21:34 +0800 Subject: [PATCH 01/49] 11 --- main.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 44d37d3..0302675 100644 --- a/main.py +++ b/main.py @@ -1 +1,6 @@ -# -*- coding:utf-8 -*- \ No newline at end of file +# -*- coding:utf-8 -*- + + + +if __name__=='__main__': + pass \ No newline at end of file From fbaba274cab4e46feb958246776267fff8cfe3f8 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Wed, 20 Jun 2018 13:41:41 +0800 Subject: [PATCH 02/49] add iris data --- iris.csv | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ main.py | 8 ++- 2 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 iris.csv diff --git a/iris.csv b/iris.csv new file mode 100644 index 0000000..c19b9c3 --- /dev/null +++ b/iris.csv @@ -0,0 +1,151 @@ +SepalLength,SepalWidth,PetalLength,PetalWidth,Name +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/main.py b/main.py index 0302675..c8a05a5 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,12 @@ # -*- coding:utf-8 -*- +import pandas as pd +def fileIO(path): + data=pd.read_csv(path) + print(data.columns) + print(data.describe()) if __name__=='__main__': - pass \ No newline at end of file + path=input('Please input the file path: ') + fileIO(path) \ No newline at end of file From 2e826c0ef4b8604aa9b5496cade6bb883bb31b0c Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Wed, 20 Jun 2018 14:15:07 +0800 Subject: [PATCH 03/49] add file information --- main.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index c8a05a5..873f756 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,26 @@ # -*- coding:utf-8 -*- +__author__ = 'xujia' + import pandas as pd -def fileIO(path): - data=pd.read_csv(path) - print(data.columns) - print(data.describe()) + +def fileInfo(path): + ''' + 获取文件信息 + :param path: 文件路径 + :return: {字段名称:[字段类型,数据量,空值数]} + ''' + infodict = {} + data = pd.read_csv(path) + for c in data.columns: + infodict[c] = data[c].dtype + ctype = data[c].dtype + nc = data[c].size - data[c].notnull().sum() + infodict[c] = [ctype, data[c].size, nc] # 字段类型,数据量,空值个数 + return infodict -if __name__=='__main__': - path=input('Please input the file path: ') - fileIO(path) \ No newline at end of file +if __name__ == '__main__': + # path=input('Please input the file path: ') + path = 'iris.csv' + ret = fileInfo(path) From 067c87c4e8ecf7c16fd20b7d8c6ff6e7daa229a5 Mon Sep 17 00:00:00 2001 From: GiantTao Date: Wed, 20 Jun 2018 14:31:43 +0800 Subject: [PATCH 04/49] AR --- ARUtil.py | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100755 ARUtil.py diff --git a/ARUtil.py b/ARUtil.py new file mode 100755 index 0000000..ca716ba --- /dev/null +++ b/ARUtil.py @@ -0,0 +1,164 @@ +# encoding:utf-8 +import pandas as pd +import numpy as np +import logging +import sys +reload(sys) +sys.setdefaultencoding('utf8') + + +class ARFilter(object): + + def __init__(self, threshold=0.05, dest_var='y'): + self.threshold = threshold + self.dest_var = dest_var + logging.basicConfig() + self.logger = logging.getLogger("default") + self.logger.setLevel(level=logging.INFO) + + def train_cal_input(self, excel_name='input.csv'): + """ + AR值筛选 + 输入:宽表【变量1、变量2、目标变量】、筛选下限(默认0.05)、目标变量名称(默认y) + 输出:筛选后的变量列表【变量名称,AR值】(按照AR值降序排列) + 计算方式:使用单个变量与目标变量进行逻辑回归运算,返回模型的K-S值即为该变量的AR值。 + """ + from sklearn.linear_model import LogisticRegression + from sklearn.metrics import roc_curve + data = pd.read_csv(excel_name) + # 创建逻辑回归模型 + logit_model = LogisticRegression() + final_list = [] + for col in data.columns.values[0:-1]: + if col != self.dest_var: + # 特征变量值 + X = data[col].values.reshape(-1, 1) + # 拆分数据集为训练集与测试集 + x_train = X[:-20] + x_test = X[-20:] + # 目标变量值 + y = data[self.dest_var].values.reshape(-1, 1) + y_train = y[:-20] + y_test = y[-20:] + # 数据拟合 + logit_model.fit(x_train, y_train) + # 每一列与y列做预测 + # prob = logit_model.predict_proba(data[col].values.reshape(-1, 1)) + prob = logit_model.predict_proba(x_test) + # prob[:, 1] 预测结果为两列,分别为0值可能性与1值可能性,此处取1值可能性 + # fpr, tpr, thresholds = roc_curve(data[self.dest_var].values.reshape(-1, 1), prob[:, 1]) + fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1]) + from scipy import stats + # AR = float(stats.ks_2samp(y_test, prob[:, 1].reshape(-1, 1)).statistic) + # AR = float(stats.ks_2samp(y_test.ravel(), prob[:, 1]).statistic) + # testDF = pd.DataFrame() + # testDF['predict_proba'] = prob[:,1] + # testDF['label'] = np.array(y_test) + # print self.cal_ks(testDF) + # print str(AR) + "-" * 30 + ks = abs(fpr - tpr).max() + # print str(ks) + "*" * 30 + # print ks + if ks > self.threshold: + final_list.append({'varName': col, "AR": ks}) + else: + self.logger.info('列:' + col + '的AR值为:' + str(ks) + ", 低于阈值:" + str(self.threshold)) + # AR值排序 + final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True) + self.logger.info(pd.DataFrame(final_list)) + pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False) + + def cal_ks(self, data): + """手动计算KS值""" + # 对样本数据排序,根据预测值升序排序 + sorted_list = data.sort_values(['predict_proba'], ascending=True) + total_good_count = sorted_list['label'].sum() * 1.0 + total_bad_count = (sorted_list.shape[0] - total_good_count) * 1.0 + max_ks = 0.0 + good_count = 0.0 + bad_count = 0.0 + for index, row in sorted_list.iterrows(): + if row['label'] == 0: + bad_count += 1.0 + else: + good_count += 1.0 + val = abs(bad_count/total_bad_count - good_count/total_good_count) + max_ks = max(max_ks, val) + return max_ks + + def cal_ar(self, excel_name='test.xlsx'): + excel = pd.read_excel(excel_name) + if excel.columns.size < 2: + self.logger.error("未找到Excel数据源!") + return + dest_value = excel[self.dest_var] + final_list = [] + # result_frame = pd.DataFrame(columns=['varName', 'AR']) + for col in excel.columns: + if col != self.dest_var: + AR = float(stats.ks_2samp(excel[col], dest_value).statistic) + final_list + # self.logger.info(final_list) + # final_list.append({'AR': 1.0, 'colName': u'var3'}) + # final_list.append({'AR': 0.8, 'colName': u'var4'}) + final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True) + # self.logger.info("final result:" + str(final_list)) + # self.logger.info("123") + self.logger.info(pd.DataFrame(final_list)) + pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False) + + def fill_empty_value(self, col_name, file_name='input.xls', default_value=0): + """ + 缺失值填充 + 输入:宽表【变量1、变量2、目标变量】,变量名称,缺失值填充值(默认0) + 计算方式:直接将指定变量中的缺失值用参数中的填充值进行填充 + 输出:填充后的宽表,变量缺失率 + """ + data = pd.read_excel(file_name) + # print(str(np.nan)) + # print(type(str(np.nan))) + # print type(str(data['emptyCol'][14])) + # print len(str(data['emptyCol'][14]).strip()) + # print type(str(data['emptyCol'][14]).strip()) + if col_name not in data.columns.values: + self.logger.error("输入宽表中不存在指定变量") + return + else: + empty_count = data.shape[0] - data[col_name].count() + if empty_count > 0: + self.logger.info('当前共' + str(data.shape[0]) + '个变量值,其中缺失值个数为' + str(empty_count)) + # 替换空串为NAN + # data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value) + data['result'] = data[col_name].replace(' ', np.nan).fillna(value=default_value) + # self.logger.info('填补后,缺失值个数为' + str(data.shape[0] - data[col_name].count())) + self.logger.info('填补后,缺失值个数为' + str(data.shape[0] - data['result'].count())) + data.to_excel('result.xls', index=False) + else: + self.logger.info('当前不存在缺失值') + + def del_empty_value(self, file_name='input.xls', empty_rate_threshold=0.5): + """ + 缺失值剔除 + 输入:宽表【变量1、变量2、目标变量】,缺失率(默认0.5) + 计算方式:计算宽表中各个变量的缺失率,并剔除缺失率超过0.5的变量 + 输出:处理后宽表 + """ + data = pd.read_excel(file_name) + for col in data.columns.values: + if col == 'y': + continue + empty_ratio = (data[col].shape[0] - data[col].count())/data[col].shape[0] + if empty_ratio >= empty_rate_threshold: + self.logger.info("变量:" + col + "缺失率为" + str(empty_ratio) + ",高于阈值:" + str(empty_rate_threshold)) + data = data.drop(col, axis=1) + data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False) + + +def run(): + ar = ARFilter() + ar.train_cal_input() + # ar.fill_empty_value(col_name='emptyCol', file_name='empty.xls', default_value=0) + # ar.del_empty_value(file_name="empty_ratio.xls") + +if __name__ == "__main__": + run() From 654484dd85de8f96780f1952688a5f587182684d Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Wed, 20 Jun 2018 15:08:06 +0800 Subject: [PATCH 05/49] add file information --- iris.csv | 302 +++++++++++++++++++++++++++---------------------------- 1 file changed, 151 insertions(+), 151 deletions(-) diff --git a/iris.csv b/iris.csv index c19b9c3..1f80bbe 100644 --- a/iris.csv +++ b/iris.csv @@ -1,151 +1,151 @@ -SepalLength,SepalWidth,PetalLength,PetalWidth,Name -5.1,3.5,1.4,0.2,Iris-setosa -4.9,3.0,1.4,0.2,Iris-setosa -4.7,3.2,1.3,0.2,Iris-setosa -4.6,3.1,1.5,0.2,Iris-setosa -5.0,3.6,1.4,0.2,Iris-setosa -5.4,3.9,1.7,0.4,Iris-setosa -4.6,3.4,1.4,0.3,Iris-setosa -5.0,3.4,1.5,0.2,Iris-setosa -4.4,2.9,1.4,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -5.4,3.7,1.5,0.2,Iris-setosa -4.8,3.4,1.6,0.2,Iris-setosa -4.8,3.0,1.4,0.1,Iris-setosa -4.3,3.0,1.1,0.1,Iris-setosa -5.8,4.0,1.2,0.2,Iris-setosa -5.7,4.4,1.5,0.4,Iris-setosa -5.4,3.9,1.3,0.4,Iris-setosa -5.1,3.5,1.4,0.3,Iris-setosa -5.7,3.8,1.7,0.3,Iris-setosa -5.1,3.8,1.5,0.3,Iris-setosa -5.4,3.4,1.7,0.2,Iris-setosa -5.1,3.7,1.5,0.4,Iris-setosa -4.6,3.6,1.0,0.2,Iris-setosa -5.1,3.3,1.7,0.5,Iris-setosa -4.8,3.4,1.9,0.2,Iris-setosa -5.0,3.0,1.6,0.2,Iris-setosa -5.0,3.4,1.6,0.4,Iris-setosa -5.2,3.5,1.5,0.2,Iris-setosa -5.2,3.4,1.4,0.2,Iris-setosa -4.7,3.2,1.6,0.2,Iris-setosa -4.8,3.1,1.6,0.2,Iris-setosa -5.4,3.4,1.5,0.4,Iris-setosa -5.2,4.1,1.5,0.1,Iris-setosa -5.5,4.2,1.4,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -5.0,3.2,1.2,0.2,Iris-setosa -5.5,3.5,1.3,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -4.4,3.0,1.3,0.2,Iris-setosa -5.1,3.4,1.5,0.2,Iris-setosa -5.0,3.5,1.3,0.3,Iris-setosa -4.5,2.3,1.3,0.3,Iris-setosa -4.4,3.2,1.3,0.2,Iris-setosa -5.0,3.5,1.6,0.6,Iris-setosa -5.1,3.8,1.9,0.4,Iris-setosa -4.8,3.0,1.4,0.3,Iris-setosa -5.1,3.8,1.6,0.2,Iris-setosa -4.6,3.2,1.4,0.2,Iris-setosa -5.3,3.7,1.5,0.2,Iris-setosa -5.0,3.3,1.4,0.2,Iris-setosa -7.0,3.2,4.7,1.4,Iris-versicolor -6.4,3.2,4.5,1.5,Iris-versicolor -6.9,3.1,4.9,1.5,Iris-versicolor -5.5,2.3,4.0,1.3,Iris-versicolor -6.5,2.8,4.6,1.5,Iris-versicolor -5.7,2.8,4.5,1.3,Iris-versicolor -6.3,3.3,4.7,1.6,Iris-versicolor -4.9,2.4,3.3,1.0,Iris-versicolor -6.6,2.9,4.6,1.3,Iris-versicolor -5.2,2.7,3.9,1.4,Iris-versicolor -5.0,2.0,3.5,1.0,Iris-versicolor -5.9,3.0,4.2,1.5,Iris-versicolor -6.0,2.2,4.0,1.0,Iris-versicolor -6.1,2.9,4.7,1.4,Iris-versicolor -5.6,2.9,3.6,1.3,Iris-versicolor -6.7,3.1,4.4,1.4,Iris-versicolor -5.6,3.0,4.5,1.5,Iris-versicolor -5.8,2.7,4.1,1.0,Iris-versicolor -6.2,2.2,4.5,1.5,Iris-versicolor -5.6,2.5,3.9,1.1,Iris-versicolor -5.9,3.2,4.8,1.8,Iris-versicolor -6.1,2.8,4.0,1.3,Iris-versicolor -6.3,2.5,4.9,1.5,Iris-versicolor -6.1,2.8,4.7,1.2,Iris-versicolor -6.4,2.9,4.3,1.3,Iris-versicolor -6.6,3.0,4.4,1.4,Iris-versicolor -6.8,2.8,4.8,1.4,Iris-versicolor -6.7,3.0,5.0,1.7,Iris-versicolor -6.0,2.9,4.5,1.5,Iris-versicolor -5.7,2.6,3.5,1.0,Iris-versicolor -5.5,2.4,3.8,1.1,Iris-versicolor -5.5,2.4,3.7,1.0,Iris-versicolor -5.8,2.7,3.9,1.2,Iris-versicolor -6.0,2.7,5.1,1.6,Iris-versicolor -5.4,3.0,4.5,1.5,Iris-versicolor -6.0,3.4,4.5,1.6,Iris-versicolor -6.7,3.1,4.7,1.5,Iris-versicolor -6.3,2.3,4.4,1.3,Iris-versicolor -5.6,3.0,4.1,1.3,Iris-versicolor -5.5,2.5,4.0,1.3,Iris-versicolor -5.5,2.6,4.4,1.2,Iris-versicolor -6.1,3.0,4.6,1.4,Iris-versicolor -5.8,2.6,4.0,1.2,Iris-versicolor -5.0,2.3,3.3,1.0,Iris-versicolor -5.6,2.7,4.2,1.3,Iris-versicolor -5.7,3.0,4.2,1.2,Iris-versicolor -5.7,2.9,4.2,1.3,Iris-versicolor -6.2,2.9,4.3,1.3,Iris-versicolor -5.1,2.5,3.0,1.1,Iris-versicolor -5.7,2.8,4.1,1.3,Iris-versicolor -6.3,3.3,6.0,2.5,Iris-virginica -5.8,2.7,5.1,1.9,Iris-virginica -7.1,3.0,5.9,2.1,Iris-virginica -6.3,2.9,5.6,1.8,Iris-virginica -6.5,3.0,5.8,2.2,Iris-virginica -7.6,3.0,6.6,2.1,Iris-virginica -4.9,2.5,4.5,1.7,Iris-virginica -7.3,2.9,6.3,1.8,Iris-virginica -6.7,2.5,5.8,1.8,Iris-virginica -7.2,3.6,6.1,2.5,Iris-virginica -6.5,3.2,5.1,2.0,Iris-virginica -6.4,2.7,5.3,1.9,Iris-virginica -6.8,3.0,5.5,2.1,Iris-virginica -5.7,2.5,5.0,2.0,Iris-virginica -5.8,2.8,5.1,2.4,Iris-virginica -6.4,3.2,5.3,2.3,Iris-virginica -6.5,3.0,5.5,1.8,Iris-virginica -7.7,3.8,6.7,2.2,Iris-virginica -7.7,2.6,6.9,2.3,Iris-virginica -6.0,2.2,5.0,1.5,Iris-virginica -6.9,3.2,5.7,2.3,Iris-virginica -5.6,2.8,4.9,2.0,Iris-virginica -7.7,2.8,6.7,2.0,Iris-virginica -6.3,2.7,4.9,1.8,Iris-virginica -6.7,3.3,5.7,2.1,Iris-virginica -7.2,3.2,6.0,1.8,Iris-virginica -6.2,2.8,4.8,1.8,Iris-virginica -6.1,3.0,4.9,1.8,Iris-virginica -6.4,2.8,5.6,2.1,Iris-virginica -7.2,3.0,5.8,1.6,Iris-virginica -7.4,2.8,6.1,1.9,Iris-virginica -7.9,3.8,6.4,2.0,Iris-virginica -6.4,2.8,5.6,2.2,Iris-virginica -6.3,2.8,5.1,1.5,Iris-virginica -6.1,2.6,5.6,1.4,Iris-virginica -7.7,3.0,6.1,2.3,Iris-virginica -6.3,3.4,5.6,2.4,Iris-virginica -6.4,3.1,5.5,1.8,Iris-virginica -6.0,3.0,4.8,1.8,Iris-virginica -6.9,3.1,5.4,2.1,Iris-virginica -6.7,3.1,5.6,2.4,Iris-virginica -6.9,3.1,5.1,2.3,Iris-virginica -5.8,2.7,5.1,1.9,Iris-virginica -6.8,3.2,5.9,2.3,Iris-virginica -6.7,3.3,5.7,2.5,Iris-virginica -6.7,3.0,5.2,2.3,Iris-virginica -6.3,2.5,5.0,1.9,Iris-virginica -6.5,3.0,5.2,2.0,Iris-virginica -6.2,3.4,5.4,2.3,Iris-virginica -5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file +SepalLength,SepalWidth,PetalLength,PetalWidth,Label +5.1,3.5,1.4,0.2,0 +4.9,3.0,1.4,0.2,0 +4.7,3.2,1.3,0.2,0 +4.6,3.1,1.5,0.2,0 +5.0,3.6,1.4,0.2,0 +5.4,3.9,1.7,0.4,0 +4.6,3.4,1.4,0.3,0 +5.0,3.4,1.5,0.2,0 +4.4,2.9,1.4,0.2,0 +4.9,3.1,1.5,0.1,0 +5.4,3.7,1.5,0.2,0 +4.8,3.4,1.6,0.2,0 +4.8,3.0,1.4,0.1,0 +4.3,3.0,1.1,0.1,0 +5.8,4.0,1.2,0.2,0 +5.7,4.4,1.5,0.4,0 +5.4,3.9,1.3,0.4,0 +5.1,3.5,1.4,0.3,0 +5.7,3.8,1.7,0.3,0 +5.1,3.8,1.5,0.3,0 +5.4,3.4,1.7,0.2,0 +5.1,3.7,1.5,0.4,0 +4.6,3.6,1.0,0.2,0 +5.1,3.3,1.7,0.5,0 +4.8,3.4,1.9,0.2,0 +5.0,3.0,1.6,0.2,0 +5.0,3.4,1.6,0.4,0 +5.2,3.5,1.5,0.2,0 +5.2,3.4,1.4,0.2,0 +4.7,3.2,1.6,0.2,0 +4.8,3.1,1.6,0.2,0 +5.4,3.4,1.5,0.4,0 +5.2,4.1,1.5,0.1,0 +5.5,4.2,1.4,0.2,0 +4.9,3.1,1.5,0.1,0 +5.0,3.2,1.2,0.2,0 +5.5,3.5,1.3,0.2,0 +4.9,3.1,1.5,0.1,0 +4.4,3.0,1.3,0.2,0 +5.1,3.4,1.5,0.2,0 +5.0,3.5,1.3,0.3,0 +4.5,2.3,1.3,0.3,0 +4.4,3.2,1.3,0.2,0 +5.0,3.5,1.6,0.6,0 +5.1,3.8,1.9,0.4,0 +4.8,3.0,1.4,0.3,0 +5.1,3.8,1.6,0.2,0 +4.6,3.2,1.4,0.2,0 +5.3,3.7,1.5,0.2,0 +5.0,3.3,1.4,0.2,0 +7.0,3.2,4.7,1.4,1 +6.4,3.2,4.5,1.5,1 +6.9,3.1,4.9,1.5,1 +5.5,2.3,4.0,1.3,1 +6.5,2.8,4.6,1.5,1 +5.7,2.8,4.5,1.3,1 +6.3,3.3,4.7,1.6,1 +4.9,2.4,3.3,1.0,1 +6.6,2.9,4.6,1.3,1 +5.2,2.7,3.9,1.4,1 +5.0,2.0,3.5,1.0,1 +5.9,3.0,4.2,1.5,1 +6.0,2.2,4.0,1.0,1 +6.1,2.9,4.7,1.4,1 +5.6,2.9,3.6,1.3,1 +6.7,3.1,4.4,1.4,1 +5.6,3.0,4.5,1.5,1 +5.8,2.7,4.1,1.0,1 +6.2,2.2,4.5,1.5,1 +5.6,2.5,3.9,1.1,1 +5.9,3.2,4.8,1.8,1 +6.1,2.8,4.0,1.3,1 +6.3,2.5,4.9,1.5,1 +6.1,2.8,4.7,1.2,1 +6.4,2.9,4.3,1.3,1 +6.6,3.0,4.4,1.4,1 +6.8,2.8,4.8,1.4,1 +6.7,3.0,5.0,1.7,1 +6.0,2.9,4.5,1.5,1 +5.7,2.6,3.5,1.0,1 +5.5,2.4,3.8,1.1,1 +5.5,2.4,3.7,1.0,1 +5.8,2.7,3.9,1.2,1 +6.0,2.7,5.1,1.6,1 +5.4,3.0,4.5,1.5,1 +6.0,3.4,4.5,1.6,1 +6.7,3.1,4.7,1.5,1 +6.3,2.3,4.4,1.3,1 +5.6,3.0,4.1,1.3,1 +5.5,2.5,4.0,1.3,1 +5.5,2.6,4.4,1.2,1 +6.1,3.0,4.6,1.4,1 +5.8,2.6,4.0,1.2,1 +5.0,2.3,3.3,1.0,1 +5.6,2.7,4.2,1.3,1 +5.7,3.0,4.2,1.2,1 +5.7,2.9,4.2,1.3,1 +6.2,2.9,4.3,1.3,1 +5.1,2.5,3.0,1.1,1 +5.7,2.8,4.1,1.3,1 +6.3,3.3,6.0,2.5,2 +5.8,2.7,5.1,1.9,2 +7.1,3.0,5.9,2.1,2 +6.3,2.9,5.6,1.8,2 +6.5,3.0,5.8,2.2,2 +7.6,3.0,6.6,2.1,2 +4.9,2.5,4.5,1.7,2 +7.3,2.9,6.3,1.8,2 +6.7,2.5,5.8,1.8,2 +7.2,3.6,6.1,2.5,2 +6.5,3.2,5.1,2.0,2 +6.4,2.7,5.3,1.9,2 +6.8,3.0,5.5,2.1,2 +5.7,2.5,5.0,2.0,2 +5.8,2.8,5.1,2.4,2 +6.4,3.2,5.3,2.3,2 +6.5,3.0,5.5,1.8,2 +7.7,3.8,6.7,2.2,2 +7.7,2.6,6.9,2.3,2 +6.0,2.2,5.0,1.5,2 +6.9,3.2,5.7,2.3,2 +5.6,2.8,4.9,2.0,2 +7.7,2.8,6.7,2.0,2 +6.3,2.7,4.9,1.8,2 +6.7,3.3,5.7,2.1,2 +7.2,3.2,6.0,1.8,2 +6.2,2.8,4.8,1.8,2 +6.1,3.0,4.9,1.8,2 +6.4,2.8,5.6,2.1,2 +7.2,3.0,5.8,1.6,2 +7.4,2.8,6.1,1.9,2 +7.9,3.8,6.4,2.0,2 +6.4,2.8,5.6,2.2,2 +6.3,2.8,5.1,1.5,2 +6.1,2.6,5.6,1.4,2 +7.7,3.0,6.1,2.3,2 +6.3,3.4,5.6,2.4,2 +6.4,3.1,5.5,1.8,2 +6.0,3.0,4.8,1.8,2 +6.9,3.1,5.4,2.1,2 +6.7,3.1,5.6,2.4,2 +6.9,3.1,5.1,2.3,2 +5.8,2.7,5.1,1.9,2 +6.8,3.2,5.9,2.3,2 +6.7,3.3,5.7,2.5,2 +6.7,3.0,5.2,2.3,2 +6.3,2.5,5.0,1.9,2 +6.5,3.0,5.2,2.0,2 +6.2,3.4,5.4,2.3,2 +5.9,3.0,5.1,1.8,2 \ No newline at end of file From 72c28ae051b9e150cdba7d0ba1118894f9f43424 Mon Sep 17 00:00:00 2001 From: GiantTao Date: Wed, 20 Jun 2018 16:36:35 +0800 Subject: [PATCH 06/49] Update ARUtil.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 注释编码 --- ARUtil.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ARUtil.py b/ARUtil.py index ca716ba..ed05eec 100755 --- a/ARUtil.py +++ b/ARUtil.py @@ -2,9 +2,9 @@ import pandas as pd import numpy as np import logging -import sys -reload(sys) -sys.setdefaultencoding('utf8') +# import sys +# reload(sys) +# sys.setdefaultencoding('utf8') class ARFilter(object): From 89457ac9b72f88c96813dc76fcb0ade13731fcb9 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Wed, 20 Jun 2018 16:57:05 +0800 Subject: [PATCH 07/49] add file data split --- main.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 873f756..3d360a4 100644 --- a/main.py +++ b/main.py @@ -2,13 +2,14 @@ __author__ = 'xujia' import pandas as pd +import numpy as np def fileInfo(path): ''' 获取文件信息 :param path: 文件路径 - :return: {字段名称:[字段类型,数据量,空值数]} + :return: {字段名称:[字段类型,数据量,空值个数]} ''' infodict = {} data = pd.read_csv(path) @@ -17,10 +18,29 @@ def fileInfo(path): ctype = data[c].dtype nc = data[c].size - data[c].notnull().sum() infodict[c] = [ctype, data[c].size, nc] # 字段类型,数据量,空值个数 - return infodict + return infodict, data + + +def dataSplit(data, ratio): + ''' + 数据分割 + :param data:带分割数据 + :param ratio: 分割比例 + :return: (数据集1,数据集2) + ''' + dataCount = data.shape[0] + selectedCount = int(dataCount * ratio) + if selectedCount > 0: + splitedData = np.split(data.sample(frac=1), [selectedCount], axis=0) + else: + return 'Data is too less' + return splitedData if __name__ == '__main__': # path=input('Please input the file path: ') path = 'iris.csv' - ret = fileInfo(path) + dict, data = fileInfo(path) + t = dataSplit(data, 0.8) + print(t[0]) + print(t[1]) From de96725dcc0dec677973d95bd6313bb62e9abd6e Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Wed, 20 Jun 2018 17:00:55 +0800 Subject: [PATCH 08/49] add file data split --- iris.csv | 52 +--------------------------------------------------- 1 file changed, 1 insertion(+), 51 deletions(-) diff --git a/iris.csv b/iris.csv index 1f80bbe..2b6058e 100644 --- a/iris.csv +++ b/iris.csv @@ -98,54 +98,4 @@ SepalLength,SepalWidth,PetalLength,PetalWidth,Label 5.7,2.9,4.2,1.3,1 6.2,2.9,4.3,1.3,1 5.1,2.5,3.0,1.1,1 -5.7,2.8,4.1,1.3,1 -6.3,3.3,6.0,2.5,2 -5.8,2.7,5.1,1.9,2 -7.1,3.0,5.9,2.1,2 -6.3,2.9,5.6,1.8,2 -6.5,3.0,5.8,2.2,2 -7.6,3.0,6.6,2.1,2 -4.9,2.5,4.5,1.7,2 -7.3,2.9,6.3,1.8,2 -6.7,2.5,5.8,1.8,2 -7.2,3.6,6.1,2.5,2 -6.5,3.2,5.1,2.0,2 -6.4,2.7,5.3,1.9,2 -6.8,3.0,5.5,2.1,2 -5.7,2.5,5.0,2.0,2 -5.8,2.8,5.1,2.4,2 -6.4,3.2,5.3,2.3,2 -6.5,3.0,5.5,1.8,2 -7.7,3.8,6.7,2.2,2 -7.7,2.6,6.9,2.3,2 -6.0,2.2,5.0,1.5,2 -6.9,3.2,5.7,2.3,2 -5.6,2.8,4.9,2.0,2 -7.7,2.8,6.7,2.0,2 -6.3,2.7,4.9,1.8,2 -6.7,3.3,5.7,2.1,2 -7.2,3.2,6.0,1.8,2 -6.2,2.8,4.8,1.8,2 -6.1,3.0,4.9,1.8,2 -6.4,2.8,5.6,2.1,2 -7.2,3.0,5.8,1.6,2 -7.4,2.8,6.1,1.9,2 -7.9,3.8,6.4,2.0,2 -6.4,2.8,5.6,2.2,2 -6.3,2.8,5.1,1.5,2 -6.1,2.6,5.6,1.4,2 -7.7,3.0,6.1,2.3,2 -6.3,3.4,5.6,2.4,2 -6.4,3.1,5.5,1.8,2 -6.0,3.0,4.8,1.8,2 -6.9,3.1,5.4,2.1,2 -6.7,3.1,5.6,2.4,2 -6.9,3.1,5.1,2.3,2 -5.8,2.7,5.1,1.9,2 -6.8,3.2,5.9,2.3,2 -6.7,3.3,5.7,2.5,2 -6.7,3.0,5.2,2.3,2 -6.3,2.5,5.0,1.9,2 -6.5,3.0,5.2,2.0,2 -6.2,3.4,5.4,2.3,2 -5.9,3.0,5.1,1.8,2 \ No newline at end of file +5.7,2.8,4.1,1.3,1 \ No newline at end of file From 20153a0003f8e04b15d329e4dee3be43e4332946 Mon Sep 17 00:00:00 2001 From: GiantTao Date: Thu, 21 Jun 2018 08:42:26 +0800 Subject: [PATCH 09/49] =?UTF-8?q?=E5=89=94=E9=99=A4=E9=83=A8=E5=88=86?= =?UTF-8?q?=E5=AD=97=E6=AE=B5=EF=BC=8C=E6=8F=90=E4=BE=9B=E7=BC=BA=E5=A4=B1?= =?UTF-8?q?=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- iris.csv | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/iris.csv b/iris.csv index 2b6058e..a320fb3 100644 --- a/iris.csv +++ b/iris.csv @@ -1,18 +1,18 @@ SepalLength,SepalWidth,PetalLength,PetalWidth,Label -5.1,3.5,1.4,0.2,0 -4.9,3.0,1.4,0.2,0 -4.7,3.2,1.3,0.2,0 -4.6,3.1,1.5,0.2,0 -5.0,3.6,1.4,0.2,0 -5.4,3.9,1.7,0.4,0 -4.6,3.4,1.4,0.3,0 +5.1,3.5,,0.2,0 +4.9,3.0,,0.2,0 +4.7,3.2,,0.2,0 +4.6,3.1,,0.2,0 +5.0,3.6,,0.2,0 +5.4,3.9,,0.4,0 +4.6,3.4,,0.3,0 5.0,3.4,1.5,0.2,0 4.4,2.9,1.4,0.2,0 4.9,3.1,1.5,0.1,0 5.4,3.7,1.5,0.2,0 4.8,3.4,1.6,0.2,0 4.8,3.0,1.4,0.1,0 -4.3,3.0,1.1,0.1,0 +4.3,,1.1,0.1,0 5.8,4.0,1.2,0.2,0 5.7,4.4,1.5,0.4,0 5.4,3.9,1.3,0.4,0 From 823958a3170820b6d1b81099d10a852b77bb7efa Mon Sep 17 00:00:00 2001 From: GiantTao Date: Thu, 21 Jun 2018 10:27:33 +0800 Subject: [PATCH 10/49] =?UTF-8?q?=E6=B5=81=E7=A8=8B=E5=8C=96=E5=A4=84?= =?UTF-8?q?=E7=90=86=E6=96=87=E4=BB=B6=E8=AF=BB=E5=8F=96=E3=80=81=E7=A9=BA?= =?UTF-8?q?=E5=80=BC=E5=A1=AB=E5=85=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ARUtil.py | 90 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 17 deletions(-) diff --git a/ARUtil.py b/ARUtil.py index ed05eec..34ed408 100755 --- a/ARUtil.py +++ b/ARUtil.py @@ -107,43 +107,36 @@ def cal_ar(self, excel_name='test.xlsx'): self.logger.info(pd.DataFrame(final_list)) pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False) - def fill_empty_value(self, col_name, file_name='input.xls', default_value=0): + def fill_empty_value(self, col_name, data, default_value=0): """ 缺失值填充 输入:宽表【变量1、变量2、目标变量】,变量名称,缺失值填充值(默认0) 计算方式:直接将指定变量中的缺失值用参数中的填充值进行填充 输出:填充后的宽表,变量缺失率 """ - data = pd.read_excel(file_name) - # print(str(np.nan)) - # print(type(str(np.nan))) - # print type(str(data['emptyCol'][14])) - # print len(str(data['emptyCol'][14]).strip()) - # print type(str(data['emptyCol'][14]).strip()) + # data = pd.read_excel(file_name) if col_name not in data.columns.values: self.logger.error("输入宽表中不存在指定变量") return else: - empty_count = data.shape[0] - data[col_name].count() + empty_count = data[col_name].shape[0] - data[col_name].count() if empty_count > 0: self.logger.info('当前共' + str(data.shape[0]) + '个变量值,其中缺失值个数为' + str(empty_count)) # 替换空串为NAN - # data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value) - data['result'] = data[col_name].replace(' ', np.nan).fillna(value=default_value) - # self.logger.info('填补后,缺失值个数为' + str(data.shape[0] - data[col_name].count())) - self.logger.info('填补后,缺失值个数为' + str(data.shape[0] - data['result'].count())) - data.to_excel('result.xls', index=False) + data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value) + self.logger.info('填补后,缺失值个数为' + str(data[col_name].shape[0] - data[col_name].count())) + # data.to_excel('result.xls', index=False) + return data else: self.logger.info('当前不存在缺失值') - def del_empty_value(self, file_name='input.xls', empty_rate_threshold=0.5): + def del_empty_value(self, data, empty_rate_threshold=0.5): """ 缺失值剔除 输入:宽表【变量1、变量2、目标变量】,缺失率(默认0.5) 计算方式:计算宽表中各个变量的缺失率,并剔除缺失率超过0.5的变量 输出:处理后宽表 """ - data = pd.read_excel(file_name) for col in data.columns.values: if col == 'y': continue @@ -151,14 +144,77 @@ def del_empty_value(self, file_name='input.xls', empty_rate_threshold=0.5): if empty_ratio >= empty_rate_threshold: self.logger.info("变量:" + col + "缺失率为" + str(empty_ratio) + ",高于阈值:" + str(empty_rate_threshold)) data = data.drop(col, axis=1) - data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False) + return data + # data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False) + + def console_input(self, prompt="", if_value=[], else_value=[], if_rtn="", else_rtn=""): + rtn = input(prompt) + if rtn.strip() in if_value: + return if_rtn + elif rtn.strip() in else_value or len(else_value) == 0: + return else_rtn + else: + raise IOError("未匹配到条件") + + def file_info(self, path): + """ + 获取文件信息 + :param path: 文件路径 + :return: {字段名称:[字段类型,数据量,空值个数]} + """ + info_dict = {} + data = pd.read_csv(path) + for c in data.columns: + ctype = data[c].dtype + nc = data[c].size - data[c].notnull().sum() + info_dict[c] = [ctype, data[c].size, nc] # 字段类型,数据量,空值个数 + return info_dict, data + + def is_contain_empty_value(self, file_dict): + empty_col_list = [] + for item in file_dict: + self.logger.info(file_dict[item]) + if int(file_dict[item][2]) > 0: + self.logger.info("列" + item + "空值个数:" + str(file_dict[item][2])) + empty_col_list.append(item) + if len(empty_col_list) > 0: + return True, empty_col_list + else: + return False, [] + + def main(self): + file_path = input("请输入待处理的文件名路径:") + import os.path + if os.path.isfile(file_path): + file_dict, data = self.file_info(file_path) + is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict) + if is_contain_empty_value: + self.logger.info("当前存在缺失值") + is_fill_empty = self.console_input(prompt="是否需要填充数据?1:是,其他值:否", if_value=["1"], else_value=[], + if_rtn=True, else_rtn=False) + if is_fill_empty: + for col in empty_col_list: + fill_value = input("请输入列" + col + "待填充的数据:") + self.logger.info("列" + col + "将填充数据:" + fill_value) + data = self.fill_empty_value(col_name=col, data=data, default_value=fill_value) + print(data) + else: + self.logger.info("不填充数据,程序退出") + else: + self.logger.info("当前不存在缺失数据") + else: + self.logger.error("指定的文件路径不存在") def run(): ar = ARFilter() - ar.train_cal_input() + # ar.train_cal_input() # ar.fill_empty_value(col_name='emptyCol', file_name='empty.xls', default_value=0) # ar.del_empty_value(file_name="empty_ratio.xls") + ar.main() + if __name__ == "__main__": + run() + From 189d1422b249932db8489459d0a48e2621b3cb83 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 21 Jun 2018 10:36:58 +0800 Subject: [PATCH 11/49] add file data split --- main.py | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/main.py b/main.py index 3d360a4..0c684a0 100644 --- a/main.py +++ b/main.py @@ -14,20 +14,47 @@ def fileInfo(path): infodict = {} data = pd.read_csv(path) for c in data.columns: - infodict[c] = data[c].dtype ctype = data[c].dtype nc = data[c].size - data[c].notnull().sum() infodict[c] = [ctype, data[c].size, nc] # 字段类型,数据量,空值个数 return infodict, data -def dataSplit(data, ratio): +def changeType(df, featypedict): + typedict = {1: 'float64', 2: 'int64', 3: 'str'} + feadict = dict(zip(list(range(df.shape[1])), df.columns.values)) + + print('当前数据类型为:') + for (k, v) in featypedict.items(): + print(k.rjust(15), v[0]) + print('字段名称对应数字为:') + for (n, m) in feadict.items(): + print(n, m) + feaName = input('请输入如需要更改数据类型的字段对应的数字:') + if int(feaName) not in feadict.keys(): + feaName = input('输入字段名称错误,请重新输入:') + if int(feaName) not in feadict.keys(): + pass + feaName = feadict[int(feaName)] + + type = input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):') + if int(type) not in typedict.keys(): + type = input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):') + if int(type) not in typedict.keys(): + pass + type = typedict[int(type)] + + df[feaName] = df[feaName].astype(type) + + +def dataSplit(data): ''' 数据分割 :param data:带分割数据 :param ratio: 分割比例 :return: (数据集1,数据集2) ''' + ratio = float(input('请输入数据分割比例:')) dataCount = data.shape[0] selectedCount = int(dataCount * ratio) if selectedCount > 0: @@ -40,7 +67,11 @@ def dataSplit(data, ratio): if __name__ == '__main__': # path=input('Please input the file path: ') path = 'iris.csv' - dict, data = fileInfo(path) - t = dataSplit(data, 0.8) - print(t[0]) - print(t[1]) + feadict, data = fileInfo(path) + + changeType(data, feadict) + print(data.dtypes) + + t = dataSplit(data) + print(t[0].shape) + print(t[1].shape) From d3d19ebd0e173e1f78782ca57b16819b143921f8 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 21 Jun 2018 10:44:12 +0800 Subject: [PATCH 12/49] add file data split --- ARUtil.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ARUtil.py b/ARUtil.py index 34ed408..c898e01 100755 --- a/ARUtil.py +++ b/ARUtil.py @@ -16,6 +16,20 @@ def __init__(self, threshold=0.05, dest_var='y'): self.logger = logging.getLogger("default") self.logger.setLevel(level=logging.INFO) + def info_value(self): + """ + 信息熵 + :return: + """ + pass + + def chi_square(self): + """ + 卡方 + :return: + """ + pass + def train_cal_input(self, excel_name='input.csv'): """ AR值筛选 From 76e290ef3e43032efde8560dabd9c9645e4ade0e Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 21 Jun 2018 11:27:02 +0800 Subject: [PATCH 13/49] add file data split --- main.py | 84 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/main.py b/main.py index 0c684a0..254cdde 100644 --- a/main.py +++ b/main.py @@ -5,73 +5,77 @@ import numpy as np -def fileInfo(path): - ''' +def file_info(file_path): + """ 获取文件信息 - :param path: 文件路径 + :param file_path: 文件路径 :return: {字段名称:[字段类型,数据量,空值个数]} - ''' - infodict = {} - data = pd.read_csv(path) - for c in data.columns: - ctype = data[c].dtype - nc = data[c].size - data[c].notnull().sum() - infodict[c] = [ctype, data[c].size, nc] # 字段类型,数据量,空值个数 - return infodict, data + """ + info_dict = {} + raw_data = pd.read_csv(file_path) + for c in raw_data.columns: + c_type = raw_data[c].dtype + nc = raw_data[c].size - raw_data[c].notnull().sum() + info_dict[c] = [c_type, raw_data[c].size, nc] # 字段类型,数据量,空值个数 + return info_dict, data -def changeType(df, featypedict): - typedict = {1: 'float64', 2: 'int64', 3: 'str'} - feadict = dict(zip(list(range(df.shape[1])), df.columns.values)) +def change_type(df, fea_type_dict): + """ + 改变数据类型 + :param df: + :param fea_type_dict: + :return: + """ + type_dict = {1: 'float64', 2: 'int64', 3: 'str'} + fea_dict = dict(zip(list(range(df.shape[1])), df.columns.values)) print('当前数据类型为:') - for (k, v) in featypedict.items(): + for (k, v) in fea_type_dict.items(): print(k.rjust(15), v[0]) print('字段名称对应数字为:') for (n, m) in feadict.items(): print(n, m) - feaName = input('请输入如需要更改数据类型的字段对应的数字:') - if int(feaName) not in feadict.keys(): - feaName = input('输入字段名称错误,请重新输入:') - if int(feaName) not in feadict.keys(): + fea_name = int(input('请输入如需要更改数据类型的字段对应的数字:')) + if fea_name not in feadict.keys(): + fea_name = int(input('输入字段名称错误,请重新输入:')) + if fea_name not in feadict.keys(): pass - feaName = feadict[int(feaName)] + fea_name = fea_dict[fea_name] - type = input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):') - if int(type) not in typedict.keys(): - type = input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):') - if int(type) not in typedict.keys(): + target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):')) + if target_type not in type_dict.keys(): + target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):')) + if target_type not in type_dict.keys(): pass - type = typedict[int(type)] + type = type_dict[target_type] + df[fea_name] = df[fea_name].astype(type) - df[feaName] = df[feaName].astype(type) - -def dataSplit(data): - ''' +def data_split(data_to_split): + """ 数据分割 - :param data:带分割数据 - :param ratio: 分割比例 + :param data_to_split:带分割数据 :return: (数据集1,数据集2) - ''' + """ ratio = float(input('请输入数据分割比例:')) - dataCount = data.shape[0] - selectedCount = int(dataCount * ratio) - if selectedCount > 0: - splitedData = np.split(data.sample(frac=1), [selectedCount], axis=0) + data_count = data_to_split.shape[0] + selected_count = int(data_count * ratio) + if selected_count > 0: + split_data = np.split(data.sample(frac=1), [selected_count], axis=0) else: return 'Data is too less' - return splitedData + return split_data if __name__ == '__main__': # path=input('Please input the file path: ') path = 'iris.csv' - feadict, data = fileInfo(path) + feadict, data = file_info(path) - changeType(data, feadict) + change_type(data, feadict) print(data.dtypes) - t = dataSplit(data) + t = data_split(data) print(t[0].shape) print(t[1].shape) From 1262418e60a50fcafd2c15104f1030a391522f70 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 21 Jun 2018 11:30:53 +0800 Subject: [PATCH 14/49] add file data split --- main.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/main.py b/main.py index 254cdde..10f4322 100644 --- a/main.py +++ b/main.py @@ -28,31 +28,31 @@ def change_type(df, fea_type_dict): :return: """ type_dict = {1: 'float64', 2: 'int64', 3: 'str'} - fea_dict = dict(zip(list(range(df.shape[1])), df.columns.values)) + feature_dict = dict(zip(list(range(df.shape[1])), df.columns.values)) print('当前数据类型为:') for (k, v) in fea_type_dict.items(): print(k.rjust(15), v[0]) print('字段名称对应数字为:') - for (n, m) in feadict.items(): + for (n, m) in feature_dict.items(): print(n, m) fea_name = int(input('请输入如需要更改数据类型的字段对应的数字:')) - if fea_name not in feadict.keys(): + if fea_name not in feature_dict.keys(): fea_name = int(input('输入字段名称错误,请重新输入:')) - if fea_name not in feadict.keys(): + if fea_name not in fea_dict.keys(): pass - fea_name = fea_dict[fea_name] + fea_name = feature_dict[fea_name] target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):')) if target_type not in type_dict.keys(): target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):')) if target_type not in type_dict.keys(): pass - type = type_dict[target_type] - df[fea_name] = df[fea_name].astype(type) + target_type = type_dict[target_type] + df[fea_name] = df[fea_name].astype(target_type) -def data_split(data_to_split): +def split_data(data_to_split): """ 数据分割 :param data_to_split:带分割数据 @@ -71,11 +71,11 @@ def data_split(data_to_split): if __name__ == '__main__': # path=input('Please input the file path: ') path = 'iris.csv' - feadict, data = file_info(path) + fea_dict, data = file_info(path) - change_type(data, feadict) + change_type(data, fea_dict) print(data.dtypes) - t = data_split(data) + t = split_data(data) print(t[0].shape) print(t[1].shape) From a52e2d2fb76cd6eaf444373e1cb2c177c4e0f1d7 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 21 Jun 2018 13:27:10 +0800 Subject: [PATCH 15/49] add file data split --- main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 10f4322..f940fa7 100644 --- a/main.py +++ b/main.py @@ -17,7 +17,7 @@ def file_info(file_path): c_type = raw_data[c].dtype nc = raw_data[c].size - raw_data[c].notnull().sum() info_dict[c] = [c_type, raw_data[c].size, nc] # 字段类型,数据量,空值个数 - return info_dict, data + return info_dict, raw_data def change_type(df, fea_type_dict): @@ -33,6 +33,7 @@ def change_type(df, fea_type_dict): print('当前数据类型为:') for (k, v) in fea_type_dict.items(): print(k.rjust(15), v[0]) + print('字段名称对应数字为:') for (n, m) in feature_dict.items(): print(n, m) @@ -62,10 +63,10 @@ def split_data(data_to_split): data_count = data_to_split.shape[0] selected_count = int(data_count * ratio) if selected_count > 0: - split_data = np.split(data.sample(frac=1), [selected_count], axis=0) + splited_data = np.split(data.sample(frac=1), [selected_count], axis=0) else: return 'Data is too less' - return split_data + return splited_data if __name__ == '__main__': From f57615a055076e51d57c78fca5694a0b8085c384 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 21 Jun 2018 13:45:39 +0800 Subject: [PATCH 16/49] add binning --- ARUtil.py | 3 +-- binning.py | 34 ++++++++++++++++++++++++++++++++++ bins.py | 1 - 3 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 binning.py delete mode 100644 bins.py diff --git a/ARUtil.py b/ARUtil.py index c898e01..d8e8df5 100755 --- a/ARUtil.py +++ b/ARUtil.py @@ -204,8 +204,7 @@ def main(self): is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict) if is_contain_empty_value: self.logger.info("当前存在缺失值") - is_fill_empty = self.console_input(prompt="是否需要填充数据?1:是,其他值:否", if_value=["1"], else_value=[], - if_rtn=True, else_rtn=False) + is_fill_empty = self.console_input(prompt="是否需要填充数据?1:是,其他值:否", if_value=["1"], else_value=[],if_rtn=True, else_rtn=False) if is_fill_empty: for col in empty_col_list: fill_value = input("请输入列" + col + "待填充的数据:") diff --git a/binning.py b/binning.py new file mode 100644 index 0000000..7974076 --- /dev/null +++ b/binning.py @@ -0,0 +1,34 @@ +# -*- coding:utf-8 -*- +__author__ = 'xujia' + +import pandas as pd + + +def equal_distance_binning(data, fea_name): + """ + 等距分箱 + :param data: + :param fea_name: + :return: + """ + pass + + +def equal_frequency_binning(data, fea_name): + """ + 等频分箱 + :param data: + :param fea_name: + :return: + """ + pass + + +def auto_binning(data, fea_name): + """ + 自动分箱 + :param data: + :param fea_name: + :return: + """ + pass diff --git a/bins.py b/bins.py deleted file mode 100644 index 44d37d3..0000000 --- a/bins.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding:utf-8 -*- \ No newline at end of file From e379cb9d2b414e69d69ddbe2b9c7b2da775c968b Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 21 Jun 2018 14:03:08 +0800 Subject: [PATCH 17/49] add binning --- WOE.py | 182 +++++++++++++++++++++++++++++++++++++++++++++++++++++ binning.py | 18 +++--- 2 files changed, 191 insertions(+), 9 deletions(-) create mode 100644 WOE.py diff --git a/WOE.py b/WOE.py new file mode 100644 index 0000000..5858e2b --- /dev/null +++ b/WOE.py @@ -0,0 +1,182 @@ +# -*- coding:utf-8 -*- + +import pandas as pd +from math import log +import numpy as np +import math +from scipy import stats +from sklearn.utils.multiclass import type_of_target + + +class WOE: + def __init__(self): + self._WOE_MIN = -20 + self._WOE_MAX = 20 + + def woe(self, X, y, event=1): + ''' + Calculate woe of each feature category and information value + :param X: 2-D numpy array explanatory features which should be discreted already + :param y: 1-D numpy array target variable which should be binary + :param event: value of binary stands for the event to predict + :return: numpy array of woe dictionaries, each dictionary contains woe values for categories of each feature + numpy array of information value of each feature + ''' + self.check_target_binary(y) + X1 = self.feature_discretion(X) + + res_woe = [] + res_iv = [] + for i in range(0, X1.shape[-1]): + x = X1[:, i] + woe_dict, iv1 = self.woe_single_x(x, y, event) + res_woe.append(woe_dict) + res_iv.append(iv1) + return np.array(res_woe), np.array(res_iv) + + def woe_single_x(self, x, y, event=1): + """ + calculate woe and information for a single feature + :param x: 1-D numpy starnds for single feature + :param y: 1-D numpy array target variable + :param event: value of binary stands for the event to predict + :return: dictionary contains woe values for categories of this feature information value of this feature + """ + self.check_target_binary(y) + + event_total, non_event_total = self.count_binary(y, event=event) + x_labels = np.unique(x) + woe_dict = {} + iv = 0 + for x1 in x_labels: + y1 = y[np.where(x == x1)[0]] + event_count, non_event_count = self.count_binary(y1, event=event) + rate_event = 1.0 * event_count / event_total + rate_non_event = 1.0 * non_event_count / non_event_total + if rate_event == 0: + woe1 = self._WOE_MIN + elif rate_non_event == 0: + woe1 = self._WOE_MAX + else: + woe1 = math.log(rate_event / rate_non_event) + woe_dict[x1] = woe1 + iv += (rate_event - rate_non_event) * woe1 + return woe_dict, iv + + def woe_replace(self, X, woe_arr): + """ + replace the explanatory feature categories with its woe value + :param X: 2-D numpy array explanatory features which should be discreted already + :param woe_arr: numpy array of woe dictionaries, each dictionary contains woe values for categories of each feature + :return: the new numpy array in which woe values filled + """ + if X.shape[-1] != woe_arr.shape[-1]: + raise ValueError('WOE dict array length must be equal with features length') + + res = np.copy(X).astype(float) + idx = 0 + for woe_dict in woe_arr: + for k in woe_dict.keys(): + woe = woe_dict[k] + res[:, idx][np.where(res[:, idx] == k)[0]] = woe * 1.0 + idx += 1 + + return res + + def combined_iv(self, X, y, masks, event=1): + """ + calcute the information vlaue of combination features + :param X: 2-D numpy array explanatory features which should be discreted already + :param y: 1-D numpy array target variable + :param masks: 1-D numpy array of masks stands for which features are included in combination, + e.g. np.array([0,0,1,1,1,0,0,0,0,0,1]), the length should be same as features length + :param event: value of binary stands for the event to predict + :return: woe dictionary and information value of combined features + """ + if masks.shape[-1] != X.shape[-1]: + raise ValueError('Masks array length must be equal with features length') + + x = X[:, np.where(masks == 1)[0]] + tmp = [] + for i in range(x.shape[0]): + tmp.append(self.combine(x[i, :])) + + dumy = np.array(tmp) + # dumy_labels = np.unique(dumy) + woe, iv = self.woe_single_x(dumy, y, event) + return woe, iv + + def combine(self, list): + res = '' + for item in list: + res += str(item) + return res + + def count_binary(self, a, event=1): + event_count = (a == event).sum() + non_event_count = a.shape[-1] - event_count + return event_count, non_event_count + + def check_target_binary(self, y): + """ + check if the target variable is binary, raise error if not. + :param y: + :return: + """ + y_type = type_of_target(y) + if y_type not in ['binary']: + raise ValueError('Label type must be binary') + + def feature_discretion(self, X): + """ + Discrete the continuous features of input data X, and keep other features unchanged. + :param X : numpy array + :return: the numpy array in which all continuous features are discreted + """ + temp = [] + for i in range(0, X.shape[-1]): + x = X[:, i] + x_type = type_of_target(x) + if x_type == 'continuous': + x1 = self.discrete(x) + temp.append(x1) + else: + temp.append(x) + return np.array(temp).T + + def discrete(self, x): + """ + Discrete the input 1-D numpy array using 5 equal percentiles + :param x: 1-D numpy array + :return: discreted 1-D numpy array + """ + res = np.array([0] * x.shape[-1], dtype=int) + for i in range(5): + point1 = stats.scoreatpercentile(x, i * 20) + point2 = stats.scoreatpercentile(x, (i + 1) * 20) + x1 = x[np.where((x >= point1) & (x <= point2))] + mask = np.in1d(x, x1) + res[mask] = (i + 1) + return res + + def woe_feature(self,x,dict): + new_x = [] + for i in x: + new_x.append(dict[i]) + return new_x + + @property + def WOE_MIN(self): + return self._WOE_MIN + + @WOE_MIN.setter + def WOE_MIN(self, woe_min): + self._WOE_MIN = woe_min + + @property + def WOE_MAX(self): + return self._WOE_MAX + + @WOE_MAX.setter + def WOE_MAX(self, woe_max): + self._WOE_MAX = woe_max \ No newline at end of file diff --git a/binning.py b/binning.py index 7974076..c6a5d86 100644 --- a/binning.py +++ b/binning.py @@ -4,31 +4,31 @@ import pandas as pd -def equal_distance_binning(data, fea_name): +def equal_distance_binning(df, fea_name): """ 等距分箱 - :param data: + :param df: :param fea_name: :return: """ - pass + df[fea_name + '_d'] = pd.cut(df[fea_name]) -def equal_frequency_binning(data, fea_name): +def equal_frequency_binning(df, fea_name): """ 等频分箱 - :param data: + :param df: :param fea_name: :return: """ - pass + df[fea_name + '_f'] = pd.cut(df[fea_name]) -def auto_binning(data, fea_name): +def auto_binning(df, fea_name): """ 自动分箱 - :param data: + :param df: :param fea_name: :return: """ - pass + df[fea_name + '_a'] = pd.cut(df[fea_name]) From fa55d5524a8ff1f990a853e8698e0bc421fc4236 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 21 Jun 2018 14:08:12 +0800 Subject: [PATCH 18/49] add WOE --- WOE.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WOE.py b/WOE.py index 5858e2b..d76a88a 100644 --- a/WOE.py +++ b/WOE.py @@ -85,7 +85,7 @@ def woe_replace(self, X, woe_arr): def combined_iv(self, X, y, masks, event=1): """ - calcute the information vlaue of combination features + calcute the information value of combination features :param X: 2-D numpy array explanatory features which should be discreted already :param y: 1-D numpy array target variable :param masks: 1-D numpy array of masks stands for which features are included in combination, From 63c5e0eeebddb7348481e5162b71a91cf50abd3f Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 22 Jun 2018 08:48:12 +0800 Subject: [PATCH 19/49] add WOE --- WOE.py => woe.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) rename WOE.py => woe.py (95%) diff --git a/WOE.py b/woe.py similarity index 95% rename from WOE.py rename to woe.py index d76a88a..08e086b 100644 --- a/WOE.py +++ b/woe.py @@ -80,7 +80,6 @@ def woe_replace(self, X, woe_arr): woe = woe_dict[k] res[:, idx][np.where(res[:, idx] == k)[0]] = woe * 1.0 idx += 1 - return res def combined_iv(self, X, y, masks, event=1): @@ -179,4 +178,14 @@ def WOE_MAX(self): @WOE_MAX.setter def WOE_MAX(self, woe_max): - self._WOE_MAX = woe_max \ No newline at end of file + self._WOE_MAX = woe_max + +if __name__ == '__main__': + # path=input('Please input the file path: ') + path = 'iris.csv' + raw_data = pd.read_csv(path) + # print(raw_data) + woe=WOE() + # woe_result=woe.woe_single_x(x=raw_data,'SepalLength') + ret=pd.cut(raw_data['SepalLength'],5) + print(ret) \ No newline at end of file From 6c57c8282c938461c9a902c4399242244e6c6f3a Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 22 Jun 2018 13:25:44 +0800 Subject: [PATCH 20/49] add WOE --- binning.py | 29 +++++++++++++++++++++++------ main.py | 15 +++++++++------ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/binning.py b/binning.py index c6a5d86..593e16f 100644 --- a/binning.py +++ b/binning.py @@ -2,33 +2,50 @@ __author__ = 'xujia' import pandas as pd +import numpy as np +from scipy import stats -def equal_distance_binning(df, fea_name): +def equal_distance_binning(df, fea_name, bin_count): """ 等距分箱 :param df: :param fea_name: + :param bin_count :return: """ - df[fea_name + '_d'] = pd.cut(df[fea_name]) + df[fea_name + '_d'] = pd.cut(df[fea_name], bin_count) -def equal_frequency_binning(df, fea_name): +def equal_frequency_binning(df, fea_name, bin_count): """ 等频分箱 :param df: :param fea_name: + :param bin_count :return: """ - df[fea_name + '_f'] = pd.cut(df[fea_name]) + df[fea_name + '_f'] = pd.cut(df[fea_name], bin_count) -def auto_binning(df, fea_name): +def auto_binning(df, target_name, feature_name, max_bin_count): """ 自动分箱 :param df: :param fea_name: + :param bin_count :return: """ - df[fea_name + '_a'] = pd.cut(df[fea_name]) + r = 0 + good = df[target_name].sum() + bad = df[target_name].count() - good + while np.abs(r) < 1: + d1 = pd.DataFrame({'X': df[feature_name], 'Y': df[target_name], + 'Bucket': pd.qcut(df[feature_name], max_bin_count, duplicates='drop')}) + d2 = d1.groupby('Bucket', as_index=True) + r, p = stats.spearmanr(d2.mean().X, d2.mean().Y) + max_bin_count = max_bin_count - 1 + woe = np.log((d2.mean().Y / (1 - d2.mean().Y)) / (good / bad)) + woe_dict = woe.to_dict() + woe_values = sorted(list(woe_dict.values())) + df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]).replace(np.inf, woe_values[-2]) diff --git a/main.py b/main.py index f940fa7..81a8cad 100644 --- a/main.py +++ b/main.py @@ -3,6 +3,7 @@ import pandas as pd import numpy as np +import binning def file_info(file_path): @@ -73,10 +74,12 @@ def split_data(data_to_split): # path=input('Please input the file path: ') path = 'iris.csv' fea_dict, data = file_info(path) + data = data.fillna(0.0) + # change_type(data, fea_dict) + # print(data.dtypes) - change_type(data, fea_dict) - print(data.dtypes) - - t = split_data(data) - print(t[0].shape) - print(t[1].shape) + # t = split_data(data) + # print(t[0].shape) + # print(t[1].shape) + binning.auto_binning(data, 'Label', 'SepalLength', 10) + print(data) From eda25554ae4373c8cd59a3556d040a28790db4b6 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 22 Jun 2018 14:28:21 +0800 Subject: [PATCH 21/49] add WOE --- ARUtil.py | 14 +++++--------- binning.py | 5 ++++- main.py | 4 ++++ modeling.py | 21 +++++++++++++++++++++ 4 files changed, 34 insertions(+), 10 deletions(-) create mode 100644 modeling.py diff --git a/ARUtil.py b/ARUtil.py index d8e8df5..1cf4a9c 100755 --- a/ARUtil.py +++ b/ARUtil.py @@ -2,13 +2,10 @@ import pandas as pd import numpy as np import logging -# import sys -# reload(sys) -# sys.setdefaultencoding('utf8') +from scipy import stats class ARFilter(object): - def __init__(self, threshold=0.05, dest_var='y'): self.threshold = threshold self.dest_var = dest_var @@ -96,7 +93,7 @@ def cal_ks(self, data): bad_count += 1.0 else: good_count += 1.0 - val = abs(bad_count/total_bad_count - good_count/total_good_count) + val = abs(bad_count / total_bad_count - good_count / total_good_count) max_ks = max(max_ks, val) return max_ks @@ -154,7 +151,7 @@ def del_empty_value(self, data, empty_rate_threshold=0.5): for col in data.columns.values: if col == 'y': continue - empty_ratio = (data[col].shape[0] - data[col].count())/data[col].shape[0] + empty_ratio = (data[col].shape[0] - data[col].count()) / data[col].shape[0] if empty_ratio >= empty_rate_threshold: self.logger.info("变量:" + col + "缺失率为" + str(empty_ratio) + ",高于阈值:" + str(empty_rate_threshold)) data = data.drop(col, axis=1) @@ -204,7 +201,8 @@ def main(self): is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict) if is_contain_empty_value: self.logger.info("当前存在缺失值") - is_fill_empty = self.console_input(prompt="是否需要填充数据?1:是,其他值:否", if_value=["1"], else_value=[],if_rtn=True, else_rtn=False) + is_fill_empty = self.console_input(prompt="是否需要填充数据?1:是,其他值:否", if_value=["1"], else_value=[], + if_rtn=True, else_rtn=False) if is_fill_empty: for col in empty_col_list: fill_value = input("请输入列" + col + "待填充的数据:") @@ -228,6 +226,4 @@ def run(): if __name__ == "__main__": - run() - diff --git a/binning.py b/binning.py index 593e16f..f0c4675 100644 --- a/binning.py +++ b/binning.py @@ -48,4 +48,7 @@ def auto_binning(df, target_name, feature_name, max_bin_count): woe = np.log((d2.mean().Y / (1 - d2.mean().Y)) / (good / bad)) woe_dict = woe.to_dict() woe_values = sorted(list(woe_dict.values())) - df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]).replace(np.inf, woe_values[-2]) + print(woe_values) + # 如果存在woe为inf情况,将其替换为不为inf的最大值加一 + df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]).replace(np.inf, woe_values[-2] + 1).replace( + -np.inf, woe_values[1] - 1) diff --git a/main.py b/main.py index 81a8cad..7097255 100644 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ import pandas as pd import numpy as np import binning +import modeling def file_info(file_path): @@ -82,4 +83,7 @@ def split_data(data_to_split): # print(t[0].shape) # print(t[1].shape) binning.auto_binning(data, 'Label', 'SepalLength', 10) + binning.auto_binning(data, 'Label', 'PetalLength', 10) + binning.auto_binning(data, 'Label', 'PetalWidth', 10) print(data) + print(modeling.model(data,['SepalLength_woe','PetalLength_woe','PetalWidth_woe'],'Label')) diff --git a/modeling.py b/modeling.py new file mode 100644 index 0000000..a185bf1 --- /dev/null +++ b/modeling.py @@ -0,0 +1,21 @@ +# -*- coding:utf-8 -*- +__author__ = 'xujia' +import numpy as np + +from sklearn.linear_model import LogisticRegression + + +def model(data, fea_list, target): + cls = LogisticRegression() + cls.fit(data[fea_list], data[target]) + print(cls.coef_) + print(cls.intercept_) + return cls + + +def score_trans(data, coef, intercept, scaled_value, odds, pdo): + a = (np.log(2 * odds) - np.log(odds)) / pdo + b = np.log(odds, np.e) - a * scaled_value + p = intercept + coef.dot(data) + score = np.log(p / (1 - p)) * a + b + return score From ebcc7379d020fbd28fa8da9fc37e931825e6f26c Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 22 Jun 2018 16:23:09 +0800 Subject: [PATCH 22/49] add WOE --- main.py | 10 ++++++---- modeling.py | 13 ++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/main.py b/main.py index 7097255..ea946bf 100644 --- a/main.py +++ b/main.py @@ -61,7 +61,8 @@ def split_data(data_to_split): :param data_to_split:带分割数据 :return: (数据集1,数据集2) """ - ratio = float(input('请输入数据分割比例:')) + # ratio = float(input('请输入数据分割比例:')) + ratio = 0.8 data_count = data_to_split.shape[0] selected_count = int(data_count * ratio) if selected_count > 0: @@ -79,11 +80,12 @@ def split_data(data_to_split): # change_type(data, fea_dict) # print(data.dtypes) - # t = split_data(data) # print(t[0].shape) # print(t[1].shape) binning.auto_binning(data, 'Label', 'SepalLength', 10) binning.auto_binning(data, 'Label', 'PetalLength', 10) binning.auto_binning(data, 'Label', 'PetalWidth', 10) - print(data) - print(modeling.model(data,['SepalLength_woe','PetalLength_woe','PetalWidth_woe'],'Label')) + data1, data2 = split_data(data) + model = modeling.model(data1, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') + predict_score = modeling.score_trans(data2[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5,100, 10) + print(list(zip(data2['Label'].values, predict_score))) diff --git a/modeling.py b/modeling.py index a185bf1..59434e4 100644 --- a/modeling.py +++ b/modeling.py @@ -8,14 +8,13 @@ def model(data, fea_list, target): cls = LogisticRegression() cls.fit(data[fea_list], data[target]) - print(cls.coef_) - print(cls.intercept_) return cls -def score_trans(data, coef, intercept, scaled_value, odds, pdo): - a = (np.log(2 * odds) - np.log(odds)) / pdo - b = np.log(odds, np.e) - a * scaled_value - p = intercept + coef.dot(data) - score = np.log(p / (1 - p)) * a + b +def score_trans(data, model, p, scaled_value, pdo): + b = pdo / np.log(2) + a = scaled_value + b * np.log(p) + p = model.predict_proba(data)[:, 1] + score = a - np.log(p / (1 - p)) * b + return score From 10295547c13567d58986ddc24595d8fd581eeb77 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 22 Jun 2018 17:30:01 +0800 Subject: [PATCH 23/49] add WOE --- binning.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/binning.py b/binning.py index f0c4675..4c6144c 100644 --- a/binning.py +++ b/binning.py @@ -32,15 +32,17 @@ def auto_binning(df, target_name, feature_name, max_bin_count): """ 自动分箱 :param df: - :param fea_name: - :param bin_count + :param target_name: 目标变量名 + :param feature_name:特征变量名称 + :param max_bin_count:最大分箱数 :return: """ r = 0 good = df[target_name].sum() bad = df[target_name].count() - good while np.abs(r) < 1: - d1 = pd.DataFrame({'X': df[feature_name], 'Y': df[target_name], + d1 = pd.DataFrame({'X': df[feature_name], + 'Y': df[target_name], 'Bucket': pd.qcut(df[feature_name], max_bin_count, duplicates='drop')}) d2 = d1.groupby('Bucket', as_index=True) r, p = stats.spearmanr(d2.mean().X, d2.mean().Y) @@ -50,5 +52,7 @@ def auto_binning(df, target_name, feature_name, max_bin_count): woe_values = sorted(list(woe_dict.values())) print(woe_values) # 如果存在woe为inf情况,将其替换为不为inf的最大值加一 - df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]).replace(np.inf, woe_values[-2] + 1).replace( - -np.inf, woe_values[1] - 1) + df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x])\ + .replace(np.inf, woe_values[-2] + 1)\ + .replace(-np.inf, woe_values[1] - 1) + # return woe_dict From 08f54d5a2f79ce770345abefcde6ae93c55c3dfe Mon Sep 17 00:00:00 2001 From: GiantTao Date: Mon, 25 Jun 2018 14:02:30 +0800 Subject: [PATCH 24/49] ROC --- evaluate.py | 21 +++++++++++++++++++++ main.py | 13 +++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 evaluate.py diff --git a/evaluate.py b/evaluate.py new file mode 100644 index 0000000..a00afe6 --- /dev/null +++ b/evaluate.py @@ -0,0 +1,21 @@ +from sklearn import metrics +import matplotlib.pyplot as plt + + +def auc(model, test_data, fea_list, target): + predict_value = model.predict_proba(test_data[fea_list])[:, 1] + return metrics.roc_auc_score(test_data[target], predict_value) + + +def roc(model, test_data, fea_list, target): + predict_value = model.predict_proba(test_data[fea_list])[:, 1] + fpr, tpr, thresholds = metrics.roc_curve(test_data[target], predict_value) + roc_auc = metrics.auc(fpr, tpr) + plt.figure() + plt.plot(fpr, tpr, label='data1, AUC = %0.2f' % roc_auc) + plt.legend(loc=4) + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.title("ROC Diagram") + plt.show() + diff --git a/main.py b/main.py index ea946bf..ae34431 100644 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ import pandas as pd import numpy as np import binning +import evaluate import modeling @@ -85,7 +86,11 @@ def split_data(data_to_split): binning.auto_binning(data, 'Label', 'SepalLength', 10) binning.auto_binning(data, 'Label', 'PetalLength', 10) binning.auto_binning(data, 'Label', 'PetalWidth', 10) - data1, data2 = split_data(data) - model = modeling.model(data1, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') - predict_score = modeling.score_trans(data2[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5,100, 10) - print(list(zip(data2['Label'].values, predict_score))) + train_data, test_data = split_data(data) + model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') + predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10) + print(list(zip(test_data['Label'].values, predict_score))) + + auc = evaluate.auc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') + print("auc: " + str(auc)) + evaluate.roc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') From 638334c4c99572d2e33412c641841dda4727b84c Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Mon, 25 Jun 2018 14:08:02 +0800 Subject: [PATCH 25/49] Merge branch 'master' of /Users/yuguanghui/Documents/GitHub/ScoreCard with conflicts. --- main.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index ae34431..9a6e356 100644 --- a/main.py +++ b/main.py @@ -4,7 +4,6 @@ import pandas as pd import numpy as np import binning -import evaluate import modeling @@ -86,11 +85,7 @@ def split_data(data_to_split): binning.auto_binning(data, 'Label', 'SepalLength', 10) binning.auto_binning(data, 'Label', 'PetalLength', 10) binning.auto_binning(data, 'Label', 'PetalWidth', 10) - train_data, test_data = split_data(data) - model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') - predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10) - print(list(zip(test_data['Label'].values, predict_score))) - - auc = evaluate.auc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') - print("auc: " + str(auc)) - evaluate.roc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') + data1, data2 = split_data(data) + model = modeling.model(data1, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') + predict_score = modeling.score_trans(data2[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10) + print(list(zip(data2['Label'].values, predict_score))) From eaf928ede03656e0001225a3d7a12d53dc1d4a28 Mon Sep 17 00:00:00 2001 From: GiantTao Date: Mon, 25 Jun 2018 14:11:30 +0800 Subject: [PATCH 26/49] ROC --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index ae34431..98dd6b7 100644 --- a/main.py +++ b/main.py @@ -92,5 +92,5 @@ def split_data(data_to_split): print(list(zip(test_data['Label'].values, predict_score))) auc = evaluate.auc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') - print("auc: " + str(auc)) + print("au值: " + str(auc)) evaluate.roc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') From 59de5e796c1f8d41741bfa262f4ccb4015951d0b Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Mon, 25 Jun 2018 14:49:58 +0800 Subject: [PATCH 27/49] add WOE --- binning.py | 59 ++++++++++++++-- iris.csv | 200 ++++++++++++++++++++++++++--------------------------- 2 files changed, 155 insertions(+), 104 deletions(-) diff --git a/binning.py b/binning.py index 4c6144c..c38936c 100644 --- a/binning.py +++ b/binning.py @@ -52,7 +52,58 @@ def auto_binning(df, target_name, feature_name, max_bin_count): woe_values = sorted(list(woe_dict.values())) print(woe_values) # 如果存在woe为inf情况,将其替换为不为inf的最大值加一 - df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x])\ - .replace(np.inf, woe_values[-2] + 1)\ - .replace(-np.inf, woe_values[1] - 1) - # return woe_dict + df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]) \ + .replace(np.inf, woe_values[-2] + 1) \ + .replace(-np.inf, woe_values[1] - 1) + + +def chi2(A): + ''' Compute the Chi-Square value ''' + m, k = A.shape # 行数 列数 + + R = A.sum(axis=1) # 行求和结果 + C = A.sum(axis=0) # 列求和结果 + N = A.sum() # 总和 + + res = 0 + for i in range(m): + for j in range(k): + Eij = 1.0 * R[i] * C[j] / N + if Eij != 0: + res = 1.0 * res + (A[i][j] - Eij) ** 2 / Eij + return res + + +def chi_merge(df, fea_name, target_name, dis_count): + fea_count = df[[fea_name, target_name]].copy().groupby([fea_name, target_name]).size().unstack().fillna(0.0) + while fea_count.shape[0] > dis_count: + chi_list = [] + for i in range(fea_count.shape[0] - 1): + chi_value = chi2(fea_count.iloc[i:i + 2].values) + chi_list.append([fea_count.index[i], chi_value]) + + chi_min_index = np.argmin(np.array(chi_list)[:, 1]) + if chi_min_index == len(chi_list) - 1: + current_fea = chi_list[chi_min_index][0] + fea_count.loc[current_fea] = fea_count.loc[current_fea:].sum(axis=0) + fea_count = fea_count.loc[:current_fea].copy() + else: + current_fea = chi_list[chi_min_index][0] + next_fea = chi_list[chi_min_index + 1][0] + fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea] + fea_count.drop([next_fea], inplace=True) + chi_list.remove(chi_list[chi_min_index + 1]) + print(fea_count) + + +def discrete(path): + df = pd.read_csv(path) + target_name = df.columns[-1] + fea_names = df.columns[0:-1] + dis_count = 2 + for f in fea_names: + chi_merge(df, f, target_name, dis_count) + + +if __name__ == '__main__': + discrete('iris.csv') diff --git a/iris.csv b/iris.csv index a320fb3..517507e 100644 --- a/iris.csv +++ b/iris.csv @@ -1,101 +1,101 @@ SepalLength,SepalWidth,PetalLength,PetalWidth,Label -5.1,3.5,,0.2,0 -4.9,3.0,,0.2,0 -4.7,3.2,,0.2,0 -4.6,3.1,,0.2,0 -5.0,3.6,,0.2,0 -5.4,3.9,,0.4,0 -4.6,3.4,,0.3,0 -5.0,3.4,1.5,0.2,0 -4.4,2.9,1.4,0.2,0 -4.9,3.1,1.5,0.1,0 -5.4,3.7,1.5,0.2,0 -4.8,3.4,1.6,0.2,0 -4.8,3.0,1.4,0.1,0 -4.3,,1.1,0.1,0 -5.8,4.0,1.2,0.2,0 -5.7,4.4,1.5,0.4,0 -5.4,3.9,1.3,0.4,0 -5.1,3.5,1.4,0.3,0 -5.7,3.8,1.7,0.3,0 -5.1,3.8,1.5,0.3,0 -5.4,3.4,1.7,0.2,0 -5.1,3.7,1.5,0.4,0 -4.6,3.6,1.0,0.2,0 -5.1,3.3,1.7,0.5,0 -4.8,3.4,1.9,0.2,0 -5.0,3.0,1.6,0.2,0 -5.0,3.4,1.6,0.4,0 -5.2,3.5,1.5,0.2,0 -5.2,3.4,1.4,0.2,0 -4.7,3.2,1.6,0.2,0 -4.8,3.1,1.6,0.2,0 -5.4,3.4,1.5,0.4,0 -5.2,4.1,1.5,0.1,0 -5.5,4.2,1.4,0.2,0 -4.9,3.1,1.5,0.1,0 -5.0,3.2,1.2,0.2,0 -5.5,3.5,1.3,0.2,0 -4.9,3.1,1.5,0.1,0 -4.4,3.0,1.3,0.2,0 -5.1,3.4,1.5,0.2,0 -5.0,3.5,1.3,0.3,0 -4.5,2.3,1.3,0.3,0 -4.4,3.2,1.3,0.2,0 -5.0,3.5,1.6,0.6,0 -5.1,3.8,1.9,0.4,0 -4.8,3.0,1.4,0.3,0 -5.1,3.8,1.6,0.2,0 -4.6,3.2,1.4,0.2,0 -5.3,3.7,1.5,0.2,0 -5.0,3.3,1.4,0.2,0 -7.0,3.2,4.7,1.4,1 -6.4,3.2,4.5,1.5,1 -6.9,3.1,4.9,1.5,1 -5.5,2.3,4.0,1.3,1 -6.5,2.8,4.6,1.5,1 -5.7,2.8,4.5,1.3,1 -6.3,3.3,4.7,1.6,1 -4.9,2.4,3.3,1.0,1 -6.6,2.9,4.6,1.3,1 -5.2,2.7,3.9,1.4,1 -5.0,2.0,3.5,1.0,1 -5.9,3.0,4.2,1.5,1 -6.0,2.2,4.0,1.0,1 -6.1,2.9,4.7,1.4,1 -5.6,2.9,3.6,1.3,1 -6.7,3.1,4.4,1.4,1 -5.6,3.0,4.5,1.5,1 -5.8,2.7,4.1,1.0,1 -6.2,2.2,4.5,1.5,1 -5.6,2.5,3.9,1.1,1 -5.9,3.2,4.8,1.8,1 -6.1,2.8,4.0,1.3,1 -6.3,2.5,4.9,1.5,1 -6.1,2.8,4.7,1.2,1 -6.4,2.9,4.3,1.3,1 -6.6,3.0,4.4,1.4,1 -6.8,2.8,4.8,1.4,1 -6.7,3.0,5.0,1.7,1 -6.0,2.9,4.5,1.5,1 -5.7,2.6,3.5,1.0,1 -5.5,2.4,3.8,1.1,1 -5.5,2.4,3.7,1.0,1 -5.8,2.7,3.9,1.2,1 -6.0,2.7,5.1,1.6,1 -5.4,3.0,4.5,1.5,1 -6.0,3.4,4.5,1.6,1 -6.7,3.1,4.7,1.5,1 -6.3,2.3,4.4,1.3,1 -5.6,3.0,4.1,1.3,1 -5.5,2.5,4.0,1.3,1 -5.5,2.6,4.4,1.2,1 -6.1,3.0,4.6,1.4,1 -5.8,2.6,4.0,1.2,1 -5.0,2.3,3.3,1.0,1 -5.6,2.7,4.2,1.3,1 -5.7,3.0,4.2,1.2,1 -5.7,2.9,4.2,1.3,1 -6.2,2.9,4.3,1.3,1 -5.1,2.5,3.0,1.1,1 -5.7,2.8,4.1,1.3,1 \ No newline at end of file +5.1,3.5,1.4,0.2,0.0 +4.9,3.0,1.4,0.2,0.0 +4.7,3.2,1.3,0.2,0.0 +4.6,3.1,1.5,0.2,0.0 +5.0,3.6,1.4,0.2,0.0 +5.4,3.9,1.7,0.4,0.0 +4.6,3.4,1.4,0.3,0.0 +5.0,3.4,1.5,0.2,0.0 +4.4,2.9,1.4,0.2,0.0 +4.9,3.1,1.5,0.1,0.0 +5.4,3.7,1.5,0.2,0.0 +4.8,3.4,1.6,0.2,0.0 +4.8,3.0,1.4,0.1,0.0 +4.3,3.0,1.1,0.1,0.0 +5.8,4.0,1.2,0.2,0.0 +5.7,4.4,1.5,0.4,0.0 +5.4,3.9,1.3,0.4,0.0 +5.1,3.5,1.4,0.3,0.0 +5.7,3.8,1.7,0.3,0.0 +5.1,3.8,1.5,0.3,0.0 +5.4,3.4,1.7,0.2,0.0 +5.1,3.7,1.5,0.4,0.0 +4.6,3.6,1.0,0.2,0.0 +5.1,3.3,1.7,0.5,0.0 +4.8,3.4,1.9,0.2,0.0 +5.0,3.0,1.6,0.2,0.0 +5.0,3.4,1.6,0.4,0.0 +5.2,3.5,1.5,0.2,0.0 +5.2,3.4,1.4,0.2,0.0 +4.7,3.2,1.6,0.2,0.0 +4.8,3.1,1.6,0.2,0.0 +5.4,3.4,1.5,0.4,0.0 +5.2,4.1,1.5,0.1,0.0 +5.5,4.2,1.4,0.2,0.0 +4.9,3.1,1.5,0.1,0.0 +5.0,3.2,1.2,0.2,0.0 +5.5,3.5,1.3,0.2,0.0 +4.9,3.1,1.5,0.1,0.0 +4.4,3.0,1.3,0.2,0.0 +5.1,3.4,1.5,0.2,0.0 +5.0,3.5,1.3,0.3,0.0 +4.5,2.3,1.3,0.3,0.0 +4.4,3.2,1.3,0.2,0.0 +5.0,3.5,1.6,0.6,0.0 +5.1,3.8,1.9,0.4,0.0 +4.8,3.0,1.4,0.3,0.0 +5.1,3.8,1.6,0.2,0.0 +4.6,3.2,1.4,0.2,0.0 +5.3,3.7,1.5,0.2,0.0 +5.0,3.3,1.4,0.2,0.0 +7.0,3.2,4.7,1.4,1.0 +6.4,3.2,4.5,1.5,1.0 +6.9,3.1,4.9,1.5,1.0 +5.5,2.3,4.0,1.3,1.0 +6.5,2.8,4.6,1.5,1.0 +5.7,2.8,4.5,1.3,1.0 +6.3,3.3,4.7,1.6,1.0 +4.9,2.4,3.3,1.0,1.0 +6.6,2.9,4.6,1.3,1.0 +5.2,2.7,3.9,1.4,1.0 +5.0,2.0,3.5,1.0,1.0 +5.9,3.0,4.2,1.5,1.0 +6.0,2.2,4.0,1.0,1.0 +6.1,2.9,4.7,1.4,1.0 +5.6,2.9,3.6,1.3,1.0 +6.7,3.1,4.4,1.4,1.0 +5.6,3.0,4.5,1.5,1.0 +5.8,2.7,4.1,1.0,1.0 +6.2,2.2,4.5,1.5,1.0 +5.6,2.5,3.9,1.1,1.0 +5.9,3.2,4.8,1.8,1.0 +6.1,2.8,4.0,1.3,1.0 +6.3,2.5,4.9,1.5,1.0 +6.1,2.8,4.7,1.2,1.0 +6.4,2.9,4.3,1.3,1.0 +6.6,3.0,4.4,1.4,1.0 +6.8,2.8,4.8,1.4,1.0 +6.7,3.0,5.0,1.7,1.0 +6.0,2.9,4.5,1.5,1.0 +5.7,2.6,3.5,1.0,1.0 +5.5,2.4,3.8,1.1,1.0 +5.5,2.4,3.7,1.0,1.0 +5.8,2.7,3.9,1.2,1.0 +6.0,2.7,5.1,1.6,1.0 +5.4,3.0,4.5,1.5,1.0 +6.0,3.4,4.5,1.6,1.0 +6.7,3.1,4.7,1.5,1.0 +6.3,2.3,4.4,1.3,1.0 +5.6,3.0,4.1,1.3,1.0 +5.5,2.5,4.0,1.3,1.0 +5.5,2.6,4.4,1.2,1.0 +6.1,3.0,4.6,1.4,1.0 +5.8,2.6,4.0,1.2,1.0 +5.0,2.3,3.3,1.0,1.0 +5.6,2.7,4.2,1.3,1.0 +5.7,3.0,4.2,1.2,1.0 +5.7,2.9,4.2,1.3,1.0 +6.2,2.9,4.3,1.3,1.0 +5.1,2.5,3.0,1.1,1.0 +5.7,2.8,4.1,1.3,1.0 \ No newline at end of file From 49956803f0514808b363361b06fa8b2cac5e316d Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Mon, 25 Jun 2018 16:12:57 +0800 Subject: [PATCH 28/49] add WOE --- evaluate.py | 33 +++++++++++++++++++++++++-------- main.py | 7 ++++--- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/evaluate.py b/evaluate.py index a00afe6..575f24f 100644 --- a/evaluate.py +++ b/evaluate.py @@ -1,15 +1,33 @@ -from sklearn import metrics +# -*-coding:utf-8 -*- + +from sklearn import metrics import matplotlib.pyplot as plt -def auc(model, test_data, fea_list, target): - predict_value = model.predict_proba(test_data[fea_list])[:, 1] - return metrics.roc_auc_score(test_data[target], predict_value) +def auc(model, test_data): + """ + + :param model: + :param test_data: + :param fea_list: + :param target: + :return: + """ + predict_value = model.predict_proba(test_data.ix[:,0:-1])[:, 1] + return metrics.roc_auc_score(test_data.ix[:,-1], predict_value) -def roc(model, test_data, fea_list, target): - predict_value = model.predict_proba(test_data[fea_list])[:, 1] - fpr, tpr, thresholds = metrics.roc_curve(test_data[target], predict_value) +def roc(model, test_data): + """ + + :param model: + :param test_data: + :param fea_list: + :param target: + :return: + """ + predict_value = model.predict_proba(test_data.ix[:,0:-1])[:, 1] + fpr, tpr, thresholds = metrics.roc_curve(test_data.ix[:,-1], predict_value) roc_auc = metrics.auc(fpr, tpr) plt.figure() plt.plot(fpr, tpr, label='data1, AUC = %0.2f' % roc_auc) @@ -18,4 +36,3 @@ def roc(model, test_data, fea_list, target): plt.ylabel("True Positive Rate") plt.title("ROC Diagram") plt.show() - diff --git a/main.py b/main.py index 98dd6b7..fa40647 100644 --- a/main.py +++ b/main.py @@ -88,9 +88,10 @@ def split_data(data_to_split): binning.auto_binning(data, 'Label', 'PetalWidth', 10) train_data, test_data = split_data(data) model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') - predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10) + predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, + 0.5, 100, 10) print(list(zip(test_data['Label'].values, predict_score))) - auc = evaluate.auc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') + auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) print("au值: " + str(auc)) - evaluate.roc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') + evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) From 479481fb7fc564ad7bbbd41f16a5f0dd44871afa Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Tue, 26 Jun 2018 18:37:28 +0800 Subject: [PATCH 29/49] add WOE --- evaluate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/evaluate.py b/evaluate.py index 575f24f..0c84e84 100644 --- a/evaluate.py +++ b/evaluate.py @@ -13,8 +13,8 @@ def auc(model, test_data): :param target: :return: """ - predict_value = model.predict_proba(test_data.ix[:,0:-1])[:, 1] - return metrics.roc_auc_score(test_data.ix[:,-1], predict_value) + predict_value = model.predict_proba(test_data.ix[:, 0:-1])[:, 1] + return metrics.roc_auc_score(test_data.ix[:, -1], predict_value) def roc(model, test_data): @@ -26,8 +26,8 @@ def roc(model, test_data): :param target: :return: """ - predict_value = model.predict_proba(test_data.ix[:,0:-1])[:, 1] - fpr, tpr, thresholds = metrics.roc_curve(test_data.ix[:,-1], predict_value) + predict_value = model.predict_proba(test_data.ix[:, 0:-1])[:, 1] + fpr, tpr, thresholds = metrics.roc_curve(test_data.ix[:, -1], predict_value) roc_auc = metrics.auc(fpr, tpr) plt.figure() plt.plot(fpr, tpr, label='data1, AUC = %0.2f' % roc_auc) From f9cf842e3035e2f9af4100682f4d3db40fc29fbd Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 28 Jun 2018 08:56:55 +0800 Subject: [PATCH 30/49] add WOE --- binning.py | 17 +++++++++++++++-- evaluate.py | 16 ++++++---------- main.py | 41 ++++++++++++++++++++++++++++++----------- 3 files changed, 51 insertions(+), 23 deletions(-) diff --git a/binning.py b/binning.py index c38936c..a83f09a 100644 --- a/binning.py +++ b/binning.py @@ -28,7 +28,7 @@ def equal_frequency_binning(df, fea_name, bin_count): df[fea_name + '_f'] = pd.cut(df[fea_name], bin_count) -def auto_binning(df, target_name, feature_name, max_bin_count): +def auto_binning(df, feature_name, target_name, max_bin_count): """ 自动分箱 :param df: @@ -58,7 +58,11 @@ def auto_binning(df, target_name, feature_name, max_bin_count): def chi2(A): - ''' Compute the Chi-Square value ''' + """ + 计算卡方值 + :param A:需要计算卡方的两行数据 + :return: 卡方值 + """ m, k = A.shape # 行数 列数 R = A.sum(axis=1) # 行求和结果 @@ -75,6 +79,14 @@ def chi2(A): def chi_merge(df, fea_name, target_name, dis_count): + """ + chiMerge的主算法 + :param df:数据,dataframe格式 + :param fea_name:需要进行分段的特征名称 + :param target_name:目标变量名称 + :param dis_count:最大分组数 + :return: 分割点 + """ fea_count = df[[fea_name, target_name]].copy().groupby([fea_name, target_name]).size().unstack().fillna(0.0) while fea_count.shape[0] > dis_count: chi_list = [] @@ -94,6 +106,7 @@ def chi_merge(df, fea_name, target_name, dis_count): fea_count.drop([next_fea], inplace=True) chi_list.remove(chi_list[chi_min_index + 1]) print(fea_count) + return fea_count def discrete(path): diff --git a/evaluate.py b/evaluate.py index 0c84e84..9b96214 100644 --- a/evaluate.py +++ b/evaluate.py @@ -7,11 +7,9 @@ def auc(model, test_data): """ - :param model: - :param test_data: - :param fea_list: - :param target: - :return: + :param model:模型 + :param test_data:测试数据,dataframe格式,第一列至倒数第二列为特征字段,最后一列为目标字段 + :return:auc值 """ predict_value = model.predict_proba(test_data.ix[:, 0:-1])[:, 1] return metrics.roc_auc_score(test_data.ix[:, -1], predict_value) @@ -20,11 +18,9 @@ def auc(model, test_data): def roc(model, test_data): """ - :param model: - :param test_data: - :param fea_list: - :param target: - :return: + :param model:模型 + :param test_data:测试数据,dataframe格式,第一列至倒数第二列为特征字段,最后一列为目标字段 + :return:roc曲线 """ predict_value = model.predict_proba(test_data.ix[:, 0:-1])[:, 1] fpr, tpr, thresholds = metrics.roc_curve(test_data.ix[:, -1], predict_value) diff --git a/main.py b/main.py index fa40647..739fa2b 100644 --- a/main.py +++ b/main.py @@ -6,6 +6,8 @@ import binning import evaluate import modeling +import woe +import math def file_info(file_path): @@ -83,15 +85,32 @@ def split_data(data_to_split): # print(t[0].shape) # print(t[1].shape) - binning.auto_binning(data, 'Label', 'SepalLength', 10) - binning.auto_binning(data, 'Label', 'PetalLength', 10) - binning.auto_binning(data, 'Label', 'PetalWidth', 10) - train_data, test_data = split_data(data) - model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') - predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, - 0.5, 100, 10) - print(list(zip(test_data['Label'].values, predict_score))) + # binning.auto_binning(data, 'SepalLength','Label', 10) + # binning.auto_binning(data, 'PetalLength','Label', 10) + # binning.auto_binning(data, 'PetalWidth','Label', 10) + # train_data, test_data = split_data(data) + # model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') + # predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, + # 0.5, 100, 10) + # print(list(zip(test_data['Label'].values, predict_score))) + # + # auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) + # print("au值: " + str(auc)) + # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) - auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) - print("au值: " + str(auc)) - evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) + bins = binning.chi_merge(data, 'SepalLength', 'Label', 5) + bin_index = bins.index.values.astype(float).copy() + bin_index[0] = -np.inf + bin_index = np.append(bin_index, np.inf) + interval_list = [] + woe_list = [] + for i in range(len(bin_index) - 1): + if bin_index[i] == bin_index[i + 1]: + continue + else: + interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left')) + woe_list.append( + math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum()))) + print(interval_list, woe_list) + + print(bins) From c79eb159dadc6ed5a9b8d2a94c6415d00d7207b6 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 28 Jun 2018 08:57:19 +0800 Subject: [PATCH 31/49] add WOE --- main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/main.py b/main.py index 739fa2b..8238aec 100644 --- a/main.py +++ b/main.py @@ -112,5 +112,4 @@ def split_data(data_to_split): woe_list.append( math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum()))) print(interval_list, woe_list) - print(bins) From 3e4b5ccac4178fd9b3450f416b238cf1332ba7b4 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 28 Jun 2018 09:35:59 +0800 Subject: [PATCH 32/49] add WOE --- main.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 8238aec..3df2695 100644 --- a/main.py +++ b/main.py @@ -99,17 +99,27 @@ def split_data(data_to_split): # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) bins = binning.chi_merge(data, 'SepalLength', 'Label', 5) - bin_index = bins.index.values.astype(float).copy() - bin_index[0] = -np.inf + bin_index = bins.index.values.astype(float) + # bin_index[0] = -np.inf bin_index = np.append(bin_index, np.inf) interval_list = [] woe_list = [] + max_woe = 20 + min_woe = -20 for i in range(len(bin_index) - 1): if bin_index[i] == bin_index[i + 1]: continue else: interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left')) - woe_list.append( - math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum()))) - print(interval_list, woe_list) + rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum() + rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum() + if rate_event == 0.0: + woe_list.append(min_woe) + elif rate_non_event == 0.0: + woe_list.append(max_woe) + else: + woe_list.append( + math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum()))) + bins['interval'] = interval_list + bins['woe'] = woe_list print(bins) From fd41b88dbdcbc9813698180ab22b793bc45abc90 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 28 Jun 2018 09:46:43 +0800 Subject: [PATCH 33/49] add WOE --- main.py | 26 +------------------------- woe.py | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/main.py b/main.py index 3df2695..6b87dca 100644 --- a/main.py +++ b/main.py @@ -97,29 +97,5 @@ def split_data(data_to_split): # auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) # print("au值: " + str(auc)) # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) - bins = binning.chi_merge(data, 'SepalLength', 'Label', 5) - bin_index = bins.index.values.astype(float) - # bin_index[0] = -np.inf - bin_index = np.append(bin_index, np.inf) - interval_list = [] - woe_list = [] - max_woe = 20 - min_woe = -20 - for i in range(len(bin_index) - 1): - if bin_index[i] == bin_index[i + 1]: - continue - else: - interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left')) - rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum() - rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum() - if rate_event == 0.0: - woe_list.append(min_woe) - elif rate_non_event == 0.0: - woe_list.append(max_woe) - else: - woe_list.append( - math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum()))) - bins['interval'] = interval_list - bins['woe'] = woe_list - print(bins) + woe.my_woe(bins) diff --git a/woe.py b/woe.py index 08e086b..08ae216 100644 --- a/woe.py +++ b/woe.py @@ -158,7 +158,7 @@ def discrete(self, x): res[mask] = (i + 1) return res - def woe_feature(self,x,dict): + def woe_feature(self, x, dict): new_x = [] for i in x: new_x.append(dict[i]) @@ -180,12 +180,39 @@ def WOE_MAX(self): def WOE_MAX(self, woe_max): self._WOE_MAX = woe_max + +def my_woe(bins): + bin_index = bins.index.values.astype(float) + # bin_index[0] = -np.inf + bin_index = np.append(bin_index, np.inf) + interval_list = [] + woe_list = [] + max_woe = 20 + min_woe = -20 + for i in range(len(bin_index) - 1): + if bin_index[i] == bin_index[i + 1]: + continue + else: + interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left')) + rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum() + rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum() + if rate_event == 0.0: + woe_list.append(min_woe) + elif rate_non_event == 0.0: + woe_list.append(max_woe) + else: + woe_list.append(math.log(rate_event / rate_non_event)) + bins['interval'] = interval_list + bins['woe'] = woe_list + print(bins) + + if __name__ == '__main__': # path=input('Please input the file path: ') path = 'iris.csv' raw_data = pd.read_csv(path) # print(raw_data) - woe=WOE() + woe = WOE() # woe_result=woe.woe_single_x(x=raw_data,'SepalLength') - ret=pd.cut(raw_data['SepalLength'],5) - print(ret) \ No newline at end of file + ret = pd.cut(raw_data['SepalLength'], 5) + print(ret) From 6ad5eb031acec968198a8790d214b58613a0f524 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 29 Jun 2018 16:30:45 +0800 Subject: [PATCH 34/49] add WOE --- main.py | 5 ++++- woe.py | 7 ++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 6b87dca..bf47c83 100644 --- a/main.py +++ b/main.py @@ -98,4 +98,7 @@ def split_data(data_to_split): # print("au值: " + str(auc)) # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) bins = binning.chi_merge(data, 'SepalLength', 'Label', 5) - woe.my_woe(bins) + + bin_woe = woe.my_woe(bins) + data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])) + print(data.head(20)) diff --git a/woe.py b/woe.py index 08ae216..abdf403 100644 --- a/woe.py +++ b/woe.py @@ -183,12 +183,12 @@ def WOE_MAX(self, woe_max): def my_woe(bins): bin_index = bins.index.values.astype(float) - # bin_index[0] = -np.inf + bin_index[0] = -np.inf bin_index = np.append(bin_index, np.inf) interval_list = [] woe_list = [] - max_woe = 20 - min_woe = -20 + max_woe = 10 + min_woe = -10 for i in range(len(bin_index) - 1): if bin_index[i] == bin_index[i + 1]: continue @@ -205,6 +205,7 @@ def my_woe(bins): bins['interval'] = interval_list bins['woe'] = woe_list print(bins) + return dict(zip(interval_list, woe_list)) if __name__ == '__main__': From b45c3c3725d771a28e9eb04be470c8e3d515db6a Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 29 Jun 2018 18:14:40 +0800 Subject: [PATCH 35/49] add WOE --- binning.py | 1 - main.py | 9 ++++++--- woe.py | 34 +++++++++++++++++++--------------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/binning.py b/binning.py index a83f09a..f64c18a 100644 --- a/binning.py +++ b/binning.py @@ -105,7 +105,6 @@ def chi_merge(df, fea_name, target_name, dis_count): fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea] fea_count.drop([next_fea], inplace=True) chi_list.remove(chi_list[chi_min_index + 1]) - print(fea_count) return fea_count diff --git a/main.py b/main.py index bf47c83..17d69d6 100644 --- a/main.py +++ b/main.py @@ -8,6 +8,8 @@ import modeling import woe import math +from pandas import Interval +from numpy import inf def file_info(file_path): @@ -99,6 +101,7 @@ def split_data(data_to_split): # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) bins = binning.chi_merge(data, 'SepalLength', 'Label', 5) - bin_woe = woe.my_woe(bins) - data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])) - print(data.head(20)) + bin_woe = woe.my_woe(data,bins) + # data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(str) + # data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x]) + print(data) diff --git a/woe.py b/woe.py index abdf403..f2ce8af 100644 --- a/woe.py +++ b/woe.py @@ -181,9 +181,12 @@ def WOE_MAX(self, woe_max): self._WOE_MAX = woe_max -def my_woe(bins): +def my_woe(data,bins): + fea_name = bins.index.name bin_index = bins.index.values.astype(float) bin_index[0] = -np.inf + bins.index = bin_index + bins.index.name = fea_name bin_index = np.append(bin_index, np.inf) interval_list = [] woe_list = [] @@ -193,7 +196,7 @@ def my_woe(bins): if bin_index[i] == bin_index[i + 1]: continue else: - interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left')) + interval_list.append('('+str(bin_index[i])+', '+str(bin_index[i + 1])+']') rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum() rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum() if rate_event == 0.0: @@ -204,16 +207,17 @@ def my_woe(bins): woe_list.append(math.log(rate_event / rate_non_event)) bins['interval'] = interval_list bins['woe'] = woe_list - print(bins) - return dict(zip(interval_list, woe_list)) - - -if __name__ == '__main__': - # path=input('Please input the file path: ') - path = 'iris.csv' - raw_data = pd.read_csv(path) - # print(raw_data) - woe = WOE() - # woe_result=woe.woe_single_x(x=raw_data,'SepalLength') - ret = pd.cut(raw_data['SepalLength'], 5) - print(ret) + bin_woe=dict(zip(interval_list, woe_list)) + data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(str) + data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x]) + + +# if __name__ == '__main__': +# path=input('Please input the file path: ') +# path = 'iris.csv' +# raw_data = pd.read_csv(path) +# print(raw_data) +# woe = WOE() +# woe_result=woe.woe_single_x(x=raw_data,'SepalLength') +# ret = pd.cut(raw_data['SepalLength'], 5) +# print(ret) From ecbbc011631410fe1353607f82399ec8561d2af3 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 29 Jun 2018 18:16:44 +0800 Subject: [PATCH 36/49] add WOE --- binning.py | 1 - main.py | 6 ++---- woe.py | 10 +++++----- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/binning.py b/binning.py index f64c18a..291e78f 100644 --- a/binning.py +++ b/binning.py @@ -93,7 +93,6 @@ def chi_merge(df, fea_name, target_name, dis_count): for i in range(fea_count.shape[0] - 1): chi_value = chi2(fea_count.iloc[i:i + 2].values) chi_list.append([fea_count.index[i], chi_value]) - chi_min_index = np.argmin(np.array(chi_list)[:, 1]) if chi_min_index == len(chi_list) - 1: current_fea = chi_list[chi_min_index][0] diff --git a/main.py b/main.py index 17d69d6..69c12b9 100644 --- a/main.py +++ b/main.py @@ -99,9 +99,7 @@ def split_data(data_to_split): # auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) # print("au值: " + str(auc)) # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) - bins = binning.chi_merge(data, 'SepalLength', 'Label', 5) - bin_woe = woe.my_woe(data,bins) - # data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(str) - # data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x]) + bins = binning.chi_merge(data, 'SepalLength', 'Label', 5) + bin_woe = woe.add_woe_col(data, bins) print(data) diff --git a/woe.py b/woe.py index f2ce8af..ac516b7 100644 --- a/woe.py +++ b/woe.py @@ -181,7 +181,7 @@ def WOE_MAX(self, woe_max): self._WOE_MAX = woe_max -def my_woe(data,bins): +def add_woe_col(data, bins): fea_name = bins.index.name bin_index = bins.index.values.astype(float) bin_index[0] = -np.inf @@ -196,7 +196,7 @@ def my_woe(data,bins): if bin_index[i] == bin_index[i + 1]: continue else: - interval_list.append('('+str(bin_index[i])+', '+str(bin_index[i + 1])+']') + interval_list.append('(' + str(bin_index[i]) + ', ' + str(bin_index[i + 1]) + ']') rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum() rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum() if rate_event == 0.0: @@ -207,11 +207,11 @@ def my_woe(data,bins): woe_list.append(math.log(rate_event / rate_non_event)) bins['interval'] = interval_list bins['woe'] = woe_list - bin_woe=dict(zip(interval_list, woe_list)) - data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(str) + bin_woe = dict(zip(interval_list, woe_list)) + data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype( + str) data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x]) - # if __name__ == '__main__': # path=input('Please input the file path: ') # path = 'iris.csv' From 23ee1ef06167883b1b3eafd6d58defc30fa96a54 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 29 Jun 2018 18:25:43 +0800 Subject: [PATCH 37/49] add WOE --- woe.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/woe.py b/woe.py index ac516b7..7bb40c0 100644 --- a/woe.py +++ b/woe.py @@ -182,6 +182,12 @@ def WOE_MAX(self, woe_max): def add_woe_col(data, bins): + """ + 为指定特征添加一列对应的WOE值 + :param data:原始数据 + :param bins:分段信息 + :return:在原始数据上添加一列 + """ fea_name = bins.index.name bin_index = bins.index.values.astype(float) bin_index[0] = -np.inf @@ -205,12 +211,10 @@ def add_woe_col(data, bins): woe_list.append(max_woe) else: woe_list.append(math.log(rate_event / rate_non_event)) - bins['interval'] = interval_list - bins['woe'] = woe_list bin_woe = dict(zip(interval_list, woe_list)) - data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype( - str) - data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x]) + data[fea_name + '_bin'] = pd.cut(data[fea_name], bins=np.append(bins.index.values, [np.inf])).astype(str) + data[fea_name + '_woe'] = data[fea_name + '_bin'].apply(lambda x: bin_woe[x]) + del data[fea_name + '_bin'] # if __name__ == '__main__': # path=input('Please input the file path: ') From c541709804707ee2dc0263efcd0034f24275efea Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 29 Jun 2018 18:33:57 +0800 Subject: [PATCH 38/49] add WOE --- binning.py | 38 +++++++++++++++++++------------------- woe.py | 1 - 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/binning.py b/binning.py index 291e78f..f4d01d0 100644 --- a/binning.py +++ b/binning.py @@ -6,7 +6,7 @@ from scipy import stats -def equal_distance_binning(df, fea_name, bin_count): +def equal_distance_binning(df, fea_name, target_name, bin_count): """ 等距分箱 :param df: @@ -15,9 +15,12 @@ def equal_distance_binning(df, fea_name, bin_count): :return: """ df[fea_name + '_d'] = pd.cut(df[fea_name], bin_count) + fea_count = df[[fea_name + '_d', target_name]].copy().groupby( + [fea_name + '_d', target_name]).size().unstack().fillna(0.0) + return fea_count -def equal_frequency_binning(df, fea_name, bin_count): +def equal_frequency_binning(df, fea_name, target_name, bin_count): """ 等频分箱 :param df: @@ -26,35 +29,32 @@ def equal_frequency_binning(df, fea_name, bin_count): :return: """ df[fea_name + '_f'] = pd.cut(df[fea_name], bin_count) + fea_count = df[[fea_name + '_f', target_name]].copy().groupby( + [fea_name + '_f', target_name]).size().unstack().fillna(0.0) + return fea_count -def auto_binning(df, feature_name, target_name, max_bin_count): +def auto_binning(df, fea_name, target_name, max_bin_count): """ 自动分箱 :param df: :param target_name: 目标变量名 - :param feature_name:特征变量名称 + :param fea_name:特征变量名称 :param max_bin_count:最大分箱数 :return: """ r = 0 - good = df[target_name].sum() - bad = df[target_name].count() - good while np.abs(r) < 1: - d1 = pd.DataFrame({'X': df[feature_name], + d1 = pd.DataFrame({'X': df[fea_name], 'Y': df[target_name], - 'Bucket': pd.qcut(df[feature_name], max_bin_count, duplicates='drop')}) - d2 = d1.groupby('Bucket', as_index=True) + fea_name + '_d': pd.qcut(df[fea_name], max_bin_count, duplicates='drop')}) + d2 = d1.groupby(fea_name + '_d', as_index=True) r, p = stats.spearmanr(d2.mean().X, d2.mean().Y) max_bin_count = max_bin_count - 1 - woe = np.log((d2.mean().Y / (1 - d2.mean().Y)) / (good / bad)) - woe_dict = woe.to_dict() - woe_values = sorted(list(woe_dict.values())) - print(woe_values) - # 如果存在woe为inf情况,将其替换为不为inf的最大值加一 - df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]) \ - .replace(np.inf, woe_values[-2] + 1) \ - .replace(-np.inf, woe_values[1] - 1) + + fea_count = df[[fea_name + '_d', target_name]].copy().groupby( + [fea_name + '_d', target_name]).size().unstack().fillna(0.0) + return fea_count def chi2(A): @@ -78,7 +78,7 @@ def chi2(A): return res -def chi_merge(df, fea_name, target_name, dis_count): +def chi_merge(df, fea_name, target_name, max_bin_count): """ chiMerge的主算法 :param df:数据,dataframe格式 @@ -88,7 +88,7 @@ def chi_merge(df, fea_name, target_name, dis_count): :return: 分割点 """ fea_count = df[[fea_name, target_name]].copy().groupby([fea_name, target_name]).size().unstack().fillna(0.0) - while fea_count.shape[0] > dis_count: + while fea_count.shape[0] > max_bin_count: chi_list = [] for i in range(fea_count.shape[0] - 1): chi_value = chi2(fea_count.iloc[i:i + 2].values) diff --git a/woe.py b/woe.py index 7bb40c0..5e34ba9 100644 --- a/woe.py +++ b/woe.py @@ -1,7 +1,6 @@ # -*- coding:utf-8 -*- import pandas as pd -from math import log import numpy as np import math from scipy import stats From ccb2c348d5c97d031ad645670875f3877fe3197d Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 29 Jun 2018 18:59:09 +0800 Subject: [PATCH 39/49] add WOE --- binning.py | 231 +++++++++++++++++++++++++++-------------------------- main.py | 6 +- 2 files changed, 123 insertions(+), 114 deletions(-) diff --git a/binning.py b/binning.py index f4d01d0..211dc2a 100644 --- a/binning.py +++ b/binning.py @@ -6,115 +6,122 @@ from scipy import stats -def equal_distance_binning(df, fea_name, target_name, bin_count): - """ - 等距分箱 - :param df: - :param fea_name: - :param bin_count - :return: - """ - df[fea_name + '_d'] = pd.cut(df[fea_name], bin_count) - fea_count = df[[fea_name + '_d', target_name]].copy().groupby( - [fea_name + '_d', target_name]).size().unstack().fillna(0.0) - return fea_count - - -def equal_frequency_binning(df, fea_name, target_name, bin_count): - """ - 等频分箱 - :param df: - :param fea_name: - :param bin_count - :return: - """ - df[fea_name + '_f'] = pd.cut(df[fea_name], bin_count) - fea_count = df[[fea_name + '_f', target_name]].copy().groupby( - [fea_name + '_f', target_name]).size().unstack().fillna(0.0) - return fea_count - - -def auto_binning(df, fea_name, target_name, max_bin_count): - """ - 自动分箱 - :param df: - :param target_name: 目标变量名 - :param fea_name:特征变量名称 - :param max_bin_count:最大分箱数 - :return: - """ - r = 0 - while np.abs(r) < 1: - d1 = pd.DataFrame({'X': df[fea_name], - 'Y': df[target_name], - fea_name + '_d': pd.qcut(df[fea_name], max_bin_count, duplicates='drop')}) - d2 = d1.groupby(fea_name + '_d', as_index=True) - r, p = stats.spearmanr(d2.mean().X, d2.mean().Y) - max_bin_count = max_bin_count - 1 - - fea_count = df[[fea_name + '_d', target_name]].copy().groupby( - [fea_name + '_d', target_name]).size().unstack().fillna(0.0) - return fea_count - - -def chi2(A): - """ - 计算卡方值 - :param A:需要计算卡方的两行数据 - :return: 卡方值 - """ - m, k = A.shape # 行数 列数 - - R = A.sum(axis=1) # 行求和结果 - C = A.sum(axis=0) # 列求和结果 - N = A.sum() # 总和 - - res = 0 - for i in range(m): - for j in range(k): - Eij = 1.0 * R[i] * C[j] / N - if Eij != 0: - res = 1.0 * res + (A[i][j] - Eij) ** 2 / Eij - return res - - -def chi_merge(df, fea_name, target_name, max_bin_count): - """ - chiMerge的主算法 - :param df:数据,dataframe格式 - :param fea_name:需要进行分段的特征名称 - :param target_name:目标变量名称 - :param dis_count:最大分组数 - :return: 分割点 - """ - fea_count = df[[fea_name, target_name]].copy().groupby([fea_name, target_name]).size().unstack().fillna(0.0) - while fea_count.shape[0] > max_bin_count: - chi_list = [] - for i in range(fea_count.shape[0] - 1): - chi_value = chi2(fea_count.iloc[i:i + 2].values) - chi_list.append([fea_count.index[i], chi_value]) - chi_min_index = np.argmin(np.array(chi_list)[:, 1]) - if chi_min_index == len(chi_list) - 1: - current_fea = chi_list[chi_min_index][0] - fea_count.loc[current_fea] = fea_count.loc[current_fea:].sum(axis=0) - fea_count = fea_count.loc[:current_fea].copy() - else: - current_fea = chi_list[chi_min_index][0] - next_fea = chi_list[chi_min_index + 1][0] - fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea] - fea_count.drop([next_fea], inplace=True) - chi_list.remove(chi_list[chi_min_index + 1]) - return fea_count - - -def discrete(path): - df = pd.read_csv(path) - target_name = df.columns[-1] - fea_names = df.columns[0:-1] - dis_count = 2 - for f in fea_names: - chi_merge(df, f, target_name, dis_count) - - -if __name__ == '__main__': - discrete('iris.csv') +class Bin: + def __init__(self, df, target_name, bin_count): + self.df = df + self.target_name = target_name + self.bin_count = bin_count + + def equal_distance_binning(self, fea_name): + """ + 等距分箱 + :param df: + :param fea_name: + :param target_name: + :param bin_count: + :return: + """ + + self.df[fea_name + '_d'] = pd.cut(self.df[fea_name], self.bin_count) + fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby( + [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0) + return fea_count + + def equal_frequency_binning(self, fea_name): + """ + 等频分箱 + :param df: + :param fea_name: + :param target_name: + :param bin_count: + :return: + """ + self.df[fea_name + '_f'] = pd.cut(self.df[fea_name], self.bin_count) + fea_count = self.df[[fea_name + '_f', self.target_name]].copy().groupby( + [fea_name + '_f', self.target_name]).size().unstack().fillna(0.0) + return fea_count + + def auto_binning(self, fea_name): + """ + 自动分箱 + :param df: + :param target_name: 目标变量名 + :param fea_name:特征变量名称 + :param max_bin_count:最大分箱数 + :return: + """ + r = 0 + while np.abs(r) < 1: + d1 = pd.DataFrame({'X': self.df[fea_name], + 'Y': self.df[self.target_name], + fea_name + '_d': pd.qcut(self.df[fea_name], self.bin_count, + duplicates='drop')}) + d2 = d1.groupby(fea_name + '_d', as_index=True) + r, p = stats.spearmanr(d2.mean().X, d2.mean().Y) + max_bin_count = max_bin_count - 1 + + fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby( + [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0) + return fea_count + + def chi2(self, A): + """ + 计算卡方值 + :param A:需要计算卡方的两行数据 + :return: 卡方值 + """ + m, k = A.shape # 行数 列数 + + R = A.sum(axis=1) # 行求和结果 + C = A.sum(axis=0) # 列求和结果 + N = A.sum() # 总和 + + res = 0 + for i in range(m): + for j in range(k): + Eij = 1.0 * R[i] * C[j] / N + if Eij != 0: + res = 1.0 * res + (A[i][j] - Eij) ** 2 / Eij + return res + + def chi_merge(self, fea_name): + """ + chiMerge的主算法 + :param df:数据,dataframe格式 + :param fea_name:需要进行分段的特征名称 + :param target_name:目标变量名称 + :param dis_count:最大分组数 + :return: 分割点 + """ + fea_count = self.df[[fea_name, self.target_name]].copy().groupby( + [fea_name, self.target_name]).size().unstack().fillna(0.0) + while fea_count.shape[0] > self.bin_count: + chi_list = [] + for i in range(fea_count.shape[0] - 1): + chi_value = self.chi2(fea_count.iloc[i:i + 2].values) + chi_list.append([fea_count.index[i], chi_value]) + chi_min_index = np.argmin(np.array(chi_list)[:, 1]) + if chi_min_index == len(chi_list) - 1: + current_fea = chi_list[chi_min_index][0] + fea_count.loc[current_fea] = fea_count.loc[current_fea:].sum(axis=0) + fea_count = fea_count.loc[:current_fea].copy() + else: + current_fea = chi_list[chi_min_index][0] + next_fea = chi_list[chi_min_index + 1][0] + fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea] + fea_count.drop([next_fea], inplace=True) + chi_list.remove(chi_list[chi_min_index + 1]) + return fea_count + +# +# def discrete(path): +# df = pd.read_csv(path) +# target_name = df.columns[-1] +# fea_names = df.columns[0:-1] +# dis_count = 2 +# for f in fea_names: +# chi_merge(df, f, target_name, dis_count) +# +# +# if __name__ == '__main__': +# discrete('iris.csv') diff --git a/main.py b/main.py index 69c12b9..b68dfaf 100644 --- a/main.py +++ b/main.py @@ -100,6 +100,8 @@ def split_data(data_to_split): # print("au值: " + str(auc)) # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) - bins = binning.chi_merge(data, 'SepalLength', 'Label', 5) - bin_woe = woe.add_woe_col(data, bins) + bin = binning.Bin(data, 'Label', 5) + for n in data.columns.values[:-1]: + bins = bin.chi_merge(n) + woe.add_woe_col(data, bins) print(data) From 1910b1ccbbd948e5f2a639fe062e71df547a1915 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Fri, 29 Jun 2018 19:47:28 +0800 Subject: [PATCH 40/49] add WOE --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b68dfaf..e765317 100644 --- a/main.py +++ b/main.py @@ -104,4 +104,4 @@ def split_data(data_to_split): for n in data.columns.values[:-1]: bins = bin.chi_merge(n) woe.add_woe_col(data, bins) - print(data) + print(data) \ No newline at end of file From 7071cf7839cbc147a4f441539ecf7d8122ad9d7e Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Mon, 2 Jul 2018 14:32:01 +0800 Subject: [PATCH 41/49] add WOE --- feature_selection.py | 36 ++++++++++++++++++++++++++++++++++++ main.py | 12 +++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 feature_selection.py diff --git a/feature_selection.py b/feature_selection.py new file mode 100644 index 0000000..f7e4753 --- /dev/null +++ b/feature_selection.py @@ -0,0 +1,36 @@ +# -*- coding:utf-8 -*- +__author__ = 'xujia' + +from sklearn.feature_selection import SelectKBest +from sklearn.feature_selection import chi2 +from sklearn.feature_selection import RFE +from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier +from sklearn.tree import DecisionTreeClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.feature_selection import SelectFromModel + + +def chi2_select(X, y, number): + """ + 根据卡方筛选变量, + :param X: + :param y: + :param number: + :return: + """ + X_new = SelectKBest(chi2, k=number).fit(X, y) + print(X_new.scores_) + return X_new + + +def fea_select(X, y): + clf = DecisionTreeClassifier() + clf = clf.fit(X, y) + print(clf.feature_importances_) + model = SelectFromModel(clf, prefit=True) + X_new = model.transform(X) + print(X_new) + + +from minepy import MINE +m = MINE() \ No newline at end of file diff --git a/main.py b/main.py index e765317..2932081 100644 --- a/main.py +++ b/main.py @@ -7,11 +7,13 @@ import evaluate import modeling import woe +import feature_selection import math from pandas import Interval from numpy import inf + def file_info(file_path): """ 获取文件信息 @@ -77,6 +79,9 @@ def split_data(data_to_split): return splited_data + + + if __name__ == '__main__': # path=input('Please input the file path: ') path = 'iris.csv' @@ -104,4 +109,9 @@ def split_data(data_to_split): for n in data.columns.values[:-1]: bins = bin.chi_merge(n) woe.add_woe_col(data, bins) - print(data) \ No newline at end of file + print(data) + + # select_func = chi2_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1) + # print(select_func.transform(data[['SepalLength', 'SepalWidth']])) + + feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label']) From 6b5f8a4158c57a6a289dab8b7d173f59a5d1f79c Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Mon, 2 Jul 2018 14:58:30 +0800 Subject: [PATCH 42/49] add mutural information --- feature_selection.py | 30 ++++++++++++++++++++++++++++-- main.py | 5 +---- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/feature_selection.py b/feature_selection.py index f7e4753..11570ad 100644 --- a/feature_selection.py +++ b/feature_selection.py @@ -8,6 +8,7 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import LogisticRegression from sklearn.feature_selection import SelectFromModel +from minepy import MINE def chi2_select(X, y, number): @@ -24,6 +25,12 @@ def chi2_select(X, y, number): def fea_select(X, y): + """ + 使用决策树筛选变量 + :param X: + :param y: + :return: + """ clf = DecisionTreeClassifier() clf = clf.fit(X, y) print(clf.feature_importances_) @@ -32,5 +39,24 @@ def fea_select(X, y): print(X_new) -from minepy import MINE -m = MINE() \ No newline at end of file +def mi(X, y): + """ + 计算互信息 + :param X: + :param y: + :return: + """ + mi_dict = {} + m = MINE() + try: + if X.shape[1] > 1: + for f in X.columns: + m.compute_score(X[f], y) + mi_dict[f] = m.mic() + print(mi_dict) + return mi_dict + except: + m.compute_score(X, y) + mi_dict[X.name] = m.mic() + print(mi_dict) + return mi_dict diff --git a/main.py b/main.py index 2932081..7ac4aef 100644 --- a/main.py +++ b/main.py @@ -13,7 +13,6 @@ from numpy import inf - def file_info(file_path): """ 获取文件信息 @@ -79,9 +78,6 @@ def split_data(data_to_split): return splited_data - - - if __name__ == '__main__': # path=input('Please input the file path: ') path = 'iris.csv' @@ -115,3 +111,4 @@ def split_data(data_to_split): # print(select_func.transform(data[['SepalLength', 'SepalWidth']])) feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label']) + feature_selection.mi(data['SepalWidth_woe'], data['Label']) From c3de04f68a5ff624363c955f890f80c4c741184b Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Tue, 3 Jul 2018 10:44:28 +0800 Subject: [PATCH 43/49] add mutural information --- ARUtil.py | 476 ++++++++++++++++++++++--------------------- binning.py | 23 +-- evaluate.py | 11 + feature_selection.py | 1 + main.py | 13 +- woe.py | 7 +- 6 files changed, 280 insertions(+), 251 deletions(-) diff --git a/ARUtil.py b/ARUtil.py index 1cf4a9c..0281d6a 100755 --- a/ARUtil.py +++ b/ARUtil.py @@ -1,229 +1,247 @@ -# encoding:utf-8 -import pandas as pd -import numpy as np -import logging -from scipy import stats - - -class ARFilter(object): - def __init__(self, threshold=0.05, dest_var='y'): - self.threshold = threshold - self.dest_var = dest_var - logging.basicConfig() - self.logger = logging.getLogger("default") - self.logger.setLevel(level=logging.INFO) - - def info_value(self): - """ - 信息熵 - :return: - """ - pass - - def chi_square(self): - """ - 卡方 - :return: - """ - pass - - def train_cal_input(self, excel_name='input.csv'): - """ - AR值筛选 - 输入:宽表【变量1、变量2、目标变量】、筛选下限(默认0.05)、目标变量名称(默认y) - 输出:筛选后的变量列表【变量名称,AR值】(按照AR值降序排列) - 计算方式:使用单个变量与目标变量进行逻辑回归运算,返回模型的K-S值即为该变量的AR值。 - """ - from sklearn.linear_model import LogisticRegression - from sklearn.metrics import roc_curve - data = pd.read_csv(excel_name) - # 创建逻辑回归模型 - logit_model = LogisticRegression() - final_list = [] - for col in data.columns.values[0:-1]: - if col != self.dest_var: - # 特征变量值 - X = data[col].values.reshape(-1, 1) - # 拆分数据集为训练集与测试集 - x_train = X[:-20] - x_test = X[-20:] - # 目标变量值 - y = data[self.dest_var].values.reshape(-1, 1) - y_train = y[:-20] - y_test = y[-20:] - # 数据拟合 - logit_model.fit(x_train, y_train) - # 每一列与y列做预测 - # prob = logit_model.predict_proba(data[col].values.reshape(-1, 1)) - prob = logit_model.predict_proba(x_test) - # prob[:, 1] 预测结果为两列,分别为0值可能性与1值可能性,此处取1值可能性 - # fpr, tpr, thresholds = roc_curve(data[self.dest_var].values.reshape(-1, 1), prob[:, 1]) - fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1]) - from scipy import stats - # AR = float(stats.ks_2samp(y_test, prob[:, 1].reshape(-1, 1)).statistic) - # AR = float(stats.ks_2samp(y_test.ravel(), prob[:, 1]).statistic) - # testDF = pd.DataFrame() - # testDF['predict_proba'] = prob[:,1] - # testDF['label'] = np.array(y_test) - # print self.cal_ks(testDF) - # print str(AR) + "-" * 30 - ks = abs(fpr - tpr).max() - # print str(ks) + "*" * 30 - # print ks - if ks > self.threshold: - final_list.append({'varName': col, "AR": ks}) - else: - self.logger.info('列:' + col + '的AR值为:' + str(ks) + ", 低于阈值:" + str(self.threshold)) - # AR值排序 - final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True) - self.logger.info(pd.DataFrame(final_list)) - pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False) - - def cal_ks(self, data): - """手动计算KS值""" - # 对样本数据排序,根据预测值升序排序 - sorted_list = data.sort_values(['predict_proba'], ascending=True) - total_good_count = sorted_list['label'].sum() * 1.0 - total_bad_count = (sorted_list.shape[0] - total_good_count) * 1.0 - max_ks = 0.0 - good_count = 0.0 - bad_count = 0.0 - for index, row in sorted_list.iterrows(): - if row['label'] == 0: - bad_count += 1.0 - else: - good_count += 1.0 - val = abs(bad_count / total_bad_count - good_count / total_good_count) - max_ks = max(max_ks, val) - return max_ks - - def cal_ar(self, excel_name='test.xlsx'): - excel = pd.read_excel(excel_name) - if excel.columns.size < 2: - self.logger.error("未找到Excel数据源!") - return - dest_value = excel[self.dest_var] - final_list = [] - # result_frame = pd.DataFrame(columns=['varName', 'AR']) - for col in excel.columns: - if col != self.dest_var: - AR = float(stats.ks_2samp(excel[col], dest_value).statistic) - final_list - # self.logger.info(final_list) - # final_list.append({'AR': 1.0, 'colName': u'var3'}) - # final_list.append({'AR': 0.8, 'colName': u'var4'}) - final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True) - # self.logger.info("final result:" + str(final_list)) - # self.logger.info("123") - self.logger.info(pd.DataFrame(final_list)) - pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False) - - def fill_empty_value(self, col_name, data, default_value=0): - """ - 缺失值填充 - 输入:宽表【变量1、变量2、目标变量】,变量名称,缺失值填充值(默认0) - 计算方式:直接将指定变量中的缺失值用参数中的填充值进行填充 - 输出:填充后的宽表,变量缺失率 - """ - # data = pd.read_excel(file_name) - if col_name not in data.columns.values: - self.logger.error("输入宽表中不存在指定变量") - return - else: - empty_count = data[col_name].shape[0] - data[col_name].count() - if empty_count > 0: - self.logger.info('当前共' + str(data.shape[0]) + '个变量值,其中缺失值个数为' + str(empty_count)) - # 替换空串为NAN - data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value) - self.logger.info('填补后,缺失值个数为' + str(data[col_name].shape[0] - data[col_name].count())) - # data.to_excel('result.xls', index=False) - return data - else: - self.logger.info('当前不存在缺失值') - - def del_empty_value(self, data, empty_rate_threshold=0.5): - """ - 缺失值剔除 - 输入:宽表【变量1、变量2、目标变量】,缺失率(默认0.5) - 计算方式:计算宽表中各个变量的缺失率,并剔除缺失率超过0.5的变量 - 输出:处理后宽表 - """ - for col in data.columns.values: - if col == 'y': - continue - empty_ratio = (data[col].shape[0] - data[col].count()) / data[col].shape[0] - if empty_ratio >= empty_rate_threshold: - self.logger.info("变量:" + col + "缺失率为" + str(empty_ratio) + ",高于阈值:" + str(empty_rate_threshold)) - data = data.drop(col, axis=1) - return data - # data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False) - - def console_input(self, prompt="", if_value=[], else_value=[], if_rtn="", else_rtn=""): - rtn = input(prompt) - if rtn.strip() in if_value: - return if_rtn - elif rtn.strip() in else_value or len(else_value) == 0: - return else_rtn - else: - raise IOError("未匹配到条件") - - def file_info(self, path): - """ - 获取文件信息 - :param path: 文件路径 - :return: {字段名称:[字段类型,数据量,空值个数]} - """ - info_dict = {} - data = pd.read_csv(path) - for c in data.columns: - ctype = data[c].dtype - nc = data[c].size - data[c].notnull().sum() - info_dict[c] = [ctype, data[c].size, nc] # 字段类型,数据量,空值个数 - return info_dict, data - - def is_contain_empty_value(self, file_dict): - empty_col_list = [] - for item in file_dict: - self.logger.info(file_dict[item]) - if int(file_dict[item][2]) > 0: - self.logger.info("列" + item + "空值个数:" + str(file_dict[item][2])) - empty_col_list.append(item) - if len(empty_col_list) > 0: - return True, empty_col_list - else: - return False, [] - - def main(self): - file_path = input("请输入待处理的文件名路径:") - import os.path - if os.path.isfile(file_path): - file_dict, data = self.file_info(file_path) - is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict) - if is_contain_empty_value: - self.logger.info("当前存在缺失值") - is_fill_empty = self.console_input(prompt="是否需要填充数据?1:是,其他值:否", if_value=["1"], else_value=[], - if_rtn=True, else_rtn=False) - if is_fill_empty: - for col in empty_col_list: - fill_value = input("请输入列" + col + "待填充的数据:") - self.logger.info("列" + col + "将填充数据:" + fill_value) - data = self.fill_empty_value(col_name=col, data=data, default_value=fill_value) - print(data) - else: - self.logger.info("不填充数据,程序退出") - else: - self.logger.info("当前不存在缺失数据") - else: - self.logger.error("指定的文件路径不存在") - - -def run(): - ar = ARFilter() - # ar.train_cal_input() - # ar.fill_empty_value(col_name='emptyCol', file_name='empty.xls', default_value=0) - # ar.del_empty_value(file_name="empty_ratio.xls") - ar.main() - - -if __name__ == "__main__": - run() +# -*- coding:utf-8 -*- + +from sklearn import metrics +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split + + +# class ARFilter(object): +# def __init__(self, threshold=0.05, dest_var='y'): +# self.threshold = threshold +# self.dest_var = dest_var +# logging.basicConfig() +# self.logger = logging.getLogger("default") +# self.logger.setLevel(level=logging.INFO) +# +# def info_value(self): +# """ +# 信息熵 +# :return: +# """ +# pass +# +# def chi_square(self): +# """ +# 卡方 +# :return: +# """ +# pass +# +# def train_cal_input(self, excel_name='input.csv'): +# """ +# AR值筛选 +# 输入:宽表【变量1、变量2、目标变量】、筛选下限(默认0.05)、目标变量名称(默认y) +# 输出:筛选后的变量列表【变量名称,AR值】(按照AR值降序排列) +# 计算方式:使用单个变量与目标变量进行逻辑回归运算,返回模型的K-S值即为该变量的AR值。 +# """ +# from sklearn.linear_model import LogisticRegression +# from sklearn.metrics import roc_curve +# data = pd.read_csv(excel_name) +# # 创建逻辑回归模型 +# logit_model = LogisticRegression() +# final_list = [] +# for col in data.columns.values[0:-1]: +# if col != self.dest_var: +# # 特征变量值 +# X = data[col].values.reshape(-1, 1) +# # 拆分数据集为训练集与测试集 +# x_train = X[:-20] +# x_test = X[-20:] +# # 目标变量值 +# y = data[self.dest_var].values.reshape(-1, 1) +# y_train = y[:-20] +# y_test = y[-20:] +# # 数据拟合 +# logit_model.fit(x_train, y_train) +# # 每一列与y列做预测 +# # prob = logit_model.predict_proba(data[col].values.reshape(-1, 1)) +# prob = logit_model.predict_proba(x_test) +# # prob[:, 1] 预测结果为两列,分别为0值可能性与1值可能性,此处取1值可能性 +# # fpr, tpr, thresholds = roc_curve(data[self.dest_var].values.reshape(-1, 1), prob[:, 1]) +# fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1]) +# from scipy import stats +# # AR = float(stats.ks_2samp(y_test, prob[:, 1].reshape(-1, 1)).statistic) +# # AR = float(stats.ks_2samp(y_test.ravel(), prob[:, 1]).statistic) +# # testDF = pd.DataFrame() +# # testDF['predict_proba'] = prob[:,1] +# # testDF['label'] = np.array(y_test) +# # print self.cal_ks(testDF) +# # print str(AR) + "-" * 30 +# ks = abs(fpr - tpr).max() +# # print str(ks) + "*" * 30 +# # print ks +# if ks > self.threshold: +# final_list.append({'varName': col, "AR": ks}) +# else: +# self.logger.info('列:' + col + '的AR值为:' + str(ks) + ", 低于阈值:" + str(self.threshold)) +# # AR值排序 +# final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True) +# self.logger.info(pd.DataFrame(final_list)) +# pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False) +# +# def cal_ks(self, data): +# """ +# 手动计算KS值 +# :param data: +# :return: +# """ +# # 对样本数据排序,根据预测值升序排序 +# sorted_list = data.sort_values(['predict_proba'], ascending=True) +# total_good_count = sorted_list['label'].sum() * 1.0 +# total_bad_count = (sorted_list.shape[0] - total_good_count) * 1.0 +# max_ks = 0.0 +# good_count = 0.0 +# bad_count = 0.0 +# for index, row in sorted_list.iterrows(): +# if row['label'] == 0: +# bad_count += 1.0 +# else: +# good_count += 1.0 +# val = abs(bad_count / total_bad_count - good_count / total_good_count) +# max_ks = max(max_ks, val) +# return max_ks +# +# def cal_ar(self, excel_name='test.xlsx'): +# excel = pd.read_excel(excel_name) +# if excel.columns.size < 2: +# self.logger.error("未找到Excel数据源!") +# return +# dest_value = excel[self.dest_var] +# final_list = [] +# for col in excel.columns: +# if col != self.dest_var: +# AR = float(stats.ks_2samp(excel[col], dest_value).statistic) +# final_list +# # self.logger.info(final_list) +# # final_list.append({'AR': 1.0, 'colName': u'var3'}) +# # final_list.append({'AR': 0.8, 'colName': u'var4'}) +# final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True) +# # self.logger.info("final result:" + str(final_list)) +# # self.logger.info("123") +# self.logger.info(pd.DataFrame(final_list)) +# pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False) +# +# def fill_empty_value(self, col_name, data, default_value=0): +# """ +# 缺失值填充 +# 输入:宽表【变量1、变量2、目标变量】,变量名称,缺失值填充值(默认0) +# 计算方式:直接将指定变量中的缺失值用参数中的填充值进行填充 +# 输出:填充后的宽表,变量缺失率 +# """ +# # data = pd.read_excel(file_name) +# if col_name not in data.columns.values: +# self.logger.error("输入宽表中不存在指定变量") +# return +# else: +# empty_count = data[col_name].shape[0] - data[col_name].count() +# if empty_count > 0: +# self.logger.info('当前共' + str(data.shape[0]) + '个变量值,其中缺失值个数为' + str(empty_count)) +# # 替换空串为NAN +# data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value) +# self.logger.info('填补后,缺失值个数为' + str(data[col_name].shape[0] - data[col_name].count())) +# # data.to_excel('result.xls', index=False) +# return data +# else: +# self.logger.info('当前不存在缺失值') +# +# def del_empty_value(self, data, empty_rate_threshold=0.5): +# """ +# 缺失值剔除 +# 输入:宽表【变量1、变量2、目标变量】,缺失率(默认0.5) +# 计算方式:计算宽表中各个变量的缺失率,并剔除缺失率超过0.5的变量 +# 输出:处理后宽表 +# """ +# for col in data.columns.values: +# if col == 'y': +# continue +# empty_ratio = (data[col].shape[0] - data[col].count()) / data[col].shape[0] +# if empty_ratio >= empty_rate_threshold: +# self.logger.info("变量:" + col + "缺失率为" + str(empty_ratio) + ",高于阈值:" + str(empty_rate_threshold)) +# data = data.drop(col, axis=1) +# return data +# # data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False) +# +# def console_input(self, prompt="", if_value=[], else_value=[], if_rtn="", else_rtn=""): +# rtn = input(prompt) +# if rtn.strip() in if_value: +# return if_rtn +# elif rtn.strip() in else_value or len(else_value) == 0: +# return else_rtn +# else: +# raise IOError("未匹配到条件") +# +# def file_info(self, path): +# """ +# 获取文件信息 +# :param path: 文件路径 +# :return: {字段名称:[字段类型,数据量,空值个数]} +# """ +# info_dict = {} +# data = pd.read_csv(path) +# for c in data.columns: +# ctype = data[c].dtype +# nc = data[c].size - data[c].notnull().sum() +# info_dict[c] = [ctype, data[c].size, nc] # 字段类型,数据量,空值个数 +# return info_dict, data +# +# def is_contain_empty_value(self, file_dict): +# empty_col_list = [] +# for item in file_dict: +# self.logger.info(file_dict[item]) +# if int(file_dict[item][2]) > 0: +# self.logger.info("列" + item + "空值个数:" + str(file_dict[item][2])) +# empty_col_list.append(item) +# if len(empty_col_list) > 0: +# return True, empty_col_list +# else: +# return False, [] +# +# def main(self): +# file_path = input("请输入待处理的文件名路径:") +# import os.path +# if os.path.isfile(file_path): +# file_dict, data = self.file_info(file_path) +# is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict) +# if is_contain_empty_value: +# self.logger.info("当前存在缺失值") +# is_fill_empty = self.console_input(prompt="是否需要填充数据?1:是,其他值:否", if_value=["1"], else_value=[], +# if_rtn=True, else_rtn=False) +# if is_fill_empty: +# for col in empty_col_list: +# fill_value = input("请输入列" + col + "待填充的数据:") +# self.logger.info("列" + col + "将填充数据:" + fill_value) +# data = self.fill_empty_value(col_name=col, data=data, default_value=fill_value) +# print(data) +# else: +# self.logger.info("不填充数据,程序退出") +# else: +# self.logger.info("当前不存在缺失数据") +# else: +# self.logger.error("指定的文件路径不存在") + + +def cal_ar(X, y): + """ + 计算AR值 + :param X: + :param y: + :return: + """ + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) + lr = LogisticRegression() + lr.fit(X_train.values.reshape(-1, 1), y_train) + pred = lr.predict_proba(X_test.values.reshape(-1, 1)) + ar = 2.0 * metrics.roc_auc_score(y_test, pred[:, 1]) - 1.0 + print('ar值:%s' % str(ar)) + return ar + +# def run(): +# ar = ARFilter() +# ar.train_cal_input() +# ar.fill_empty_value(col_name='emptyCol', file_name='empty.xls', default_value=0) +# ar.del_empty_value(file_name="empty_ratio.xls") +# ar.main() + + +# if __name__ == "__main__": +# run() diff --git a/binning.py b/binning.py index 211dc2a..ccd6cf2 100644 --- a/binning.py +++ b/binning.py @@ -15,53 +15,49 @@ def __init__(self, df, target_name, bin_count): def equal_distance_binning(self, fea_name): """ 等距分箱 - :param df: :param fea_name: - :param target_name: - :param bin_count: :return: """ self.df[fea_name + '_d'] = pd.cut(self.df[fea_name], self.bin_count) fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby( [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0) + fea_count.index = fea_count.index.map(lambda x: x.left) + fea_count.index.name = fea_name return fea_count def equal_frequency_binning(self, fea_name): """ 等频分箱 - :param df: :param fea_name: - :param target_name: - :param bin_count: :return: """ self.df[fea_name + '_f'] = pd.cut(self.df[fea_name], self.bin_count) fea_count = self.df[[fea_name + '_f', self.target_name]].copy().groupby( [fea_name + '_f', self.target_name]).size().unstack().fillna(0.0) + fea_count.index = fea_count.index.map(lambda x: x.left) + fea_count.index.name = fea_name return fea_count def auto_binning(self, fea_name): """ 自动分箱 - :param df: - :param target_name: 目标变量名 :param fea_name:特征变量名称 - :param max_bin_count:最大分箱数 :return: """ r = 0 while np.abs(r) < 1: d1 = pd.DataFrame({'X': self.df[fea_name], 'Y': self.df[self.target_name], - fea_name + '_d': pd.qcut(self.df[fea_name], self.bin_count, - duplicates='drop')}) + fea_name + '_d': pd.qcut(self.df[fea_name], self.bin_count, duplicates='drop')}) d2 = d1.groupby(fea_name + '_d', as_index=True) r, p = stats.spearmanr(d2.mean().X, d2.mean().Y) - max_bin_count = max_bin_count - 1 + self.bin_count = self.bin_count - 1 fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby( [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0) + fea_count.index = fea_count.index.map(lambda x: x.left) + fea_count.index.name = fea_name return fea_count def chi2(self, A): @@ -87,10 +83,7 @@ def chi2(self, A): def chi_merge(self, fea_name): """ chiMerge的主算法 - :param df:数据,dataframe格式 :param fea_name:需要进行分段的特征名称 - :param target_name:目标变量名称 - :param dis_count:最大分组数 :return: 分割点 """ fea_count = self.df[[fea_name, self.target_name]].copy().groupby( diff --git a/evaluate.py b/evaluate.py index 9b96214..4bf7d01 100644 --- a/evaluate.py +++ b/evaluate.py @@ -32,3 +32,14 @@ def roc(model, test_data): plt.ylabel("True Positive Rate") plt.title("ROC Diagram") plt.show() + + +def correlation_coef(data): + """ + 计算相关系数 + :param data: + :return: + """ + correlation = data.corr() + print(correlation) + return correlation diff --git a/feature_selection.py b/feature_selection.py index 11570ad..ad116e0 100644 --- a/feature_selection.py +++ b/feature_selection.py @@ -37,6 +37,7 @@ def fea_select(X, y): model = SelectFromModel(clf, prefit=True) X_new = model.transform(X) print(X_new) + return X_new def mi(X, y): diff --git a/main.py b/main.py index 7ac4aef..26eb76d 100644 --- a/main.py +++ b/main.py @@ -7,6 +7,7 @@ import evaluate import modeling import woe +import ARUtil import feature_selection import math from pandas import Interval @@ -61,14 +62,13 @@ def change_type(df, fea_type_dict): df[fea_name] = df[fea_name].astype(target_type) -def split_data(data_to_split): +def split_data(data_to_split, ratio): """ 数据分割 :param data_to_split:带分割数据 + :param ratio:数据分割比例 :return: (数据集1,数据集2) """ - # ratio = float(input('请输入数据分割比例:')) - ratio = 0.8 data_count = data_to_split.shape[0] selected_count = int(data_count * ratio) if selected_count > 0: @@ -107,8 +107,9 @@ def split_data(data_to_split): woe.add_woe_col(data, bins) print(data) - # select_func = chi2_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1) + # select_func = feature_selection.fea_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1) # print(select_func.transform(data[['SepalLength', 'SepalWidth']])) - feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label']) - feature_selection.mi(data['SepalWidth_woe'], data['Label']) + # feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label']) + # feature_selection.mi(data['SepalWidth_woe'], data['Label']) + ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label']) diff --git a/woe.py b/woe.py index 5e34ba9..c49b771 100644 --- a/woe.py +++ b/woe.py @@ -213,7 +213,12 @@ def add_woe_col(data, bins): bin_woe = dict(zip(interval_list, woe_list)) data[fea_name + '_bin'] = pd.cut(data[fea_name], bins=np.append(bins.index.values, [np.inf])).astype(str) data[fea_name + '_woe'] = data[fea_name + '_bin'].apply(lambda x: bin_woe[x]) - del data[fea_name + '_bin'] + if fea_name + '_bin' in data.columns.values: + del data[fea_name + '_bin'] + if fea_name + '_d' in data.columns.values: + del data[fea_name + '_d'] + if fea_name + '_f' in data.columns.values: + del data[fea_name + '_f'] # if __name__ == '__main__': # path=input('Please input the file path: ') From 6381dfb933007c59e9a37ba980dc4da348dbf68e Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Tue, 3 Jul 2018 11:06:49 +0800 Subject: [PATCH 44/49] add mutural information --- main.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index 26eb76d..23c2d10 100644 --- a/main.py +++ b/main.py @@ -12,7 +12,7 @@ import math from pandas import Interval from numpy import inf - +from pprint import pprint def file_info(file_path): """ @@ -82,34 +82,32 @@ def split_data(data_to_split, ratio): # path=input('Please input the file path: ') path = 'iris.csv' fea_dict, data = file_info(path) + pprint(fea_dict) data = data.fillna(0.0) # change_type(data, fea_dict) # print(data.dtypes) - # print(t[0].shape) - # print(t[1].shape) - # binning.auto_binning(data, 'SepalLength','Label', 10) - # binning.auto_binning(data, 'PetalLength','Label', 10) - # binning.auto_binning(data, 'PetalWidth','Label', 10) - # train_data, test_data = split_data(data) - # model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') - # predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, - # 0.5, 100, 10) - # print(list(zip(test_data['Label'].values, predict_score))) - # - # auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) - # print("au值: " + str(auc)) - # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) - bin = binning.Bin(data, 'Label', 5) for n in data.columns.values[:-1]: bins = bin.chi_merge(n) woe.add_woe_col(data, bins) - print(data) + + # 单变量ar值计算 + # ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label']) + + train_data, test_data = split_data(data,0.7) + model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') + predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10) + pprint(list(zip(test_data['Label'].values, predict_score))) + auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) + print("au值: " + str(auc)) + evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) + + # select_func = feature_selection.fea_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1) # print(select_func.transform(data[['SepalLength', 'SepalWidth']])) # feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label']) # feature_selection.mi(data['SepalWidth_woe'], data['Label']) - ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label']) + From 581948598aeba4da4ef7726e381a11e905ac0c4e Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Tue, 3 Jul 2018 11:17:11 +0800 Subject: [PATCH 45/49] add mutural information --- main.py | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index 23c2d10..a60046d 100644 --- a/main.py +++ b/main.py @@ -14,6 +14,7 @@ from numpy import inf from pprint import pprint + def file_info(file_path): """ 获取文件信息 @@ -46,20 +47,26 @@ def change_type(df, fea_type_dict): print('字段名称对应数字为:') for (n, m) in feature_dict.items(): print(n, m) - fea_name = int(input('请输入如需要更改数据类型的字段对应的数字:')) - if fea_name not in feature_dict.keys(): - fea_name = int(input('输入字段名称错误,请重新输入:')) - if fea_name not in fea_dict.keys(): - pass - fea_name = feature_dict[fea_name] - - target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):')) - if target_type not in type_dict.keys(): + if_change = input('是否需要修改字段类型?(y/n)') + if if_change == 'y': + fea_name = int(input('请输入需要更改数据类型的字段对应的数字:')) + if fea_name not in feature_dict.keys(): + fea_name = int(input('输入字段名称错误,请重新输入:')) + if fea_name not in fea_dict.keys(): + pass + fea_name = feature_dict[fea_name] + target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):')) if target_type not in type_dict.keys(): - pass - target_type = type_dict[target_type] - df[fea_name] = df[fea_name].astype(target_type) + target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):')) + if target_type not in type_dict.keys(): + pass + target_type = type_dict[target_type] + df[fea_name] = df[fea_name].astype(target_type) + elif if_change == 'n': + pass + else: + pass def split_data(data_to_split, ratio): @@ -82,10 +89,12 @@ def split_data(data_to_split, ratio): # path=input('Please input the file path: ') path = 'iris.csv' fea_dict, data = file_info(path) + print('字段名', '数据类型', '数据总量', '缺失值个数') pprint(fea_dict) data = data.fillna(0.0) - # change_type(data, fea_dict) - # print(data.dtypes) + + change_type(data, fea_dict) + print(data.dtypes) bin = binning.Bin(data, 'Label', 5) for n in data.columns.values[:-1]: @@ -95,12 +104,13 @@ def split_data(data_to_split, ratio): # 单变量ar值计算 # ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label']) - train_data, test_data = split_data(data,0.7) + train_data, test_data = split_data(data, 0.7) model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') - predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10) + predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, + 0.5, 100, 10) pprint(list(zip(test_data['Label'].values, predict_score))) auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) - print("au值: " + str(auc)) + print("auc值: " + str(auc)) evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) @@ -110,4 +120,3 @@ def split_data(data_to_split, ratio): # feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label']) # feature_selection.mi(data['SepalWidth_woe'], data['Label']) - From a6caf67f975be1348b568d7c43786c14de144e05 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Wed, 4 Jul 2018 13:27:20 +0800 Subject: [PATCH 46/49] add mutural information --- evaluate.py | 14 +++----------- ARUtil.py => feature_index.py | 10 ++++++++++ feature_selection.py | 7 ++++--- main.py | 2 +- 4 files changed, 18 insertions(+), 15 deletions(-) rename ARUtil.py => feature_index.py (98%) diff --git a/evaluate.py b/evaluate.py index 4bf7d01..e719ea1 100644 --- a/evaluate.py +++ b/evaluate.py @@ -6,7 +6,7 @@ def auc(model, test_data): """ - + AUC :param model:模型 :param test_data:测试数据,dataframe格式,第一列至倒数第二列为特征字段,最后一列为目标字段 :return:auc值 @@ -17,7 +17,7 @@ def auc(model, test_data): def roc(model, test_data): """ - + ROC :param model:模型 :param test_data:测试数据,dataframe格式,第一列至倒数第二列为特征字段,最后一列为目标字段 :return:roc曲线 @@ -34,12 +34,4 @@ def roc(model, test_data): plt.show() -def correlation_coef(data): - """ - 计算相关系数 - :param data: - :return: - """ - correlation = data.corr() - print(correlation) - return correlation + diff --git a/ARUtil.py b/feature_index.py similarity index 98% rename from ARUtil.py rename to feature_index.py index 0281d6a..0868e61 100755 --- a/ARUtil.py +++ b/feature_index.py @@ -235,6 +235,16 @@ def cal_ar(X, y): print('ar值:%s' % str(ar)) return ar +def correlation_coef(data): + """ + 计算相关系数 + :param data: + :return: + """ + correlation = data.corr() + print(correlation) + return correlation + # def run(): # ar = ARFilter() # ar.train_cal_input() diff --git a/feature_selection.py b/feature_selection.py index ad116e0..6348262 100644 --- a/feature_selection.py +++ b/feature_selection.py @@ -3,13 +3,14 @@ from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 -from sklearn.feature_selection import RFE -from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier from sklearn.tree import DecisionTreeClassifier -from sklearn.linear_model import LogisticRegression from sklearn.feature_selection import SelectFromModel from minepy import MINE +from sklearn.feature_selection import RFE +from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier +from sklearn.linear_model import LogisticRegression + def chi2_select(X, y, number): """ diff --git a/main.py b/main.py index a60046d..123e722 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ import evaluate import modeling import woe -import ARUtil +import feature_index import feature_selection import math from pandas import Interval From 88c6e71d29b8c145872f32784604885b3a963caf Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Wed, 4 Jul 2018 14:15:46 +0800 Subject: [PATCH 47/49] add mutural information --- main.py | 3 +-- modeling.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 123e722..b9b4354 100644 --- a/main.py +++ b/main.py @@ -106,8 +106,7 @@ def split_data(data_to_split, ratio): train_data, test_data = split_data(data, 0.7) model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label') - predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, - 0.5, 100, 10) + predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 300, 25) pprint(list(zip(test_data['Label'].values, predict_score))) auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) print("auc值: " + str(auc)) diff --git a/modeling.py b/modeling.py index 59434e4..ed4729f 100644 --- a/modeling.py +++ b/modeling.py @@ -11,9 +11,9 @@ def model(data, fea_list, target): return cls -def score_trans(data, model, p, scaled_value, pdo): - b = pdo / np.log(2) - a = scaled_value + b * np.log(p) +def score_trans(data, model, scaled_value, pdo): + b = -pdo / np.log(2) + a = scaled_value p = model.predict_proba(data)[:, 1] score = a - np.log(p / (1 - p)) * b From 1eff0e1e15c1f76d12e04d53f3e73418d634f316 Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 5 Jul 2018 08:56:03 +0800 Subject: [PATCH 48/49] add mutural information --- main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/main.py b/main.py index b9b4354..76d913a 100644 --- a/main.py +++ b/main.py @@ -112,8 +112,6 @@ def split_data(data_to_split, ratio): print("auc值: " + str(auc)) evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']]) - - # select_func = feature_selection.fea_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1) # print(select_func.transform(data[['SepalLength', 'SepalWidth']])) From 8cab05d08e461a066ffc08245f223d14a44ed7ad Mon Sep 17 00:00:00 2001 From: Lansingcode <1406063770@qq.com> Date: Thu, 5 Jul 2018 11:21:26 +0800 Subject: [PATCH 49/49] add mutural information --- binning.py | 4 ++++ woe.py | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/binning.py b/binning.py index ccd6cf2..ce2a17d 100644 --- a/binning.py +++ b/binning.py @@ -104,6 +104,10 @@ def chi_merge(self, fea_name): fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea] fea_count.drop([next_fea], inplace=True) chi_list.remove(chi_list[chi_min_index + 1]) + fea_count.index = np.append([-np.inf], fea_count.index.values[1:]) + fea_count['bin'] = pd.cut(np.append(fea_count.index.values, [np.inf]), + bins=np.append(fea_count.index.values, [np.inf]))[1:].astype(str) + fea_count.index.name = fea_name return fea_count # diff --git a/woe.py b/woe.py index c49b771..c3bb464 100644 --- a/woe.py +++ b/woe.py @@ -189,7 +189,6 @@ def add_woe_col(data, bins): """ fea_name = bins.index.name bin_index = bins.index.values.astype(float) - bin_index[0] = -np.inf bins.index = bin_index bins.index.name = fea_name bin_index = np.append(bin_index, np.inf) @@ -201,7 +200,7 @@ def add_woe_col(data, bins): if bin_index[i] == bin_index[i + 1]: continue else: - interval_list.append('(' + str(bin_index[i]) + ', ' + str(bin_index[i + 1]) + ']') + interval_list.append(bins['bin'][bin_index[i]]) rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum() rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum() if rate_event == 0.0: