From 0a9cdd35c48631a69a962113a03ccc5f44af5a13 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Wed, 20 Jun 2018 11:21:34 +0800
Subject: [PATCH 01/49] 11

---
 main.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 44d37d3..0302675 100644
--- a/main.py
+++ b/main.py
@@ -1 +1,6 @@
-# -*- coding:utf-8 -*-
\ No newline at end of file
+# -*- coding:utf-8 -*-
+
+
+
+if __name__=='__main__':
+    pass
\ No newline at end of file

From fbaba274cab4e46feb958246776267fff8cfe3f8 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Wed, 20 Jun 2018 13:41:41 +0800
Subject: [PATCH 02/49] add iris data

---
 iris.csv | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 main.py  |   8 ++-
 2 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 iris.csv

diff --git a/iris.csv b/iris.csv
new file mode 100644
index 0000000..c19b9c3
--- /dev/null
+++ b/iris.csv
@@ -0,0 +1,151 @@
+SepalLength,SepalWidth,PetalLength,PetalWidth,Name
+5.1,3.5,1.4,0.2,Iris-setosa
+4.9,3.0,1.4,0.2,Iris-setosa
+4.7,3.2,1.3,0.2,Iris-setosa
+4.6,3.1,1.5,0.2,Iris-setosa
+5.0,3.6,1.4,0.2,Iris-setosa
+5.4,3.9,1.7,0.4,Iris-setosa
+4.6,3.4,1.4,0.3,Iris-setosa
+5.0,3.4,1.5,0.2,Iris-setosa
+4.4,2.9,1.4,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa
+5.4,3.7,1.5,0.2,Iris-setosa
+4.8,3.4,1.6,0.2,Iris-setosa
+4.8,3.0,1.4,0.1,Iris-setosa
+4.3,3.0,1.1,0.1,Iris-setosa
+5.8,4.0,1.2,0.2,Iris-setosa
+5.7,4.4,1.5,0.4,Iris-setosa
+5.4,3.9,1.3,0.4,Iris-setosa
+5.1,3.5,1.4,0.3,Iris-setosa
+5.7,3.8,1.7,0.3,Iris-setosa
+5.1,3.8,1.5,0.3,Iris-setosa
+5.4,3.4,1.7,0.2,Iris-setosa
+5.1,3.7,1.5,0.4,Iris-setosa
+4.6,3.6,1.0,0.2,Iris-setosa
+5.1,3.3,1.7,0.5,Iris-setosa
+4.8,3.4,1.9,0.2,Iris-setosa
+5.0,3.0,1.6,0.2,Iris-setosa
+5.0,3.4,1.6,0.4,Iris-setosa
+5.2,3.5,1.5,0.2,Iris-setosa
+5.2,3.4,1.4,0.2,Iris-setosa
+4.7,3.2,1.6,0.2,Iris-setosa
+4.8,3.1,1.6,0.2,Iris-setosa
+5.4,3.4,1.5,0.4,Iris-setosa
+5.2,4.1,1.5,0.1,Iris-setosa
+5.5,4.2,1.4,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa
+5.0,3.2,1.2,0.2,Iris-setosa
+5.5,3.5,1.3,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa
+4.4,3.0,1.3,0.2,Iris-setosa
+5.1,3.4,1.5,0.2,Iris-setosa
+5.0,3.5,1.3,0.3,Iris-setosa
+4.5,2.3,1.3,0.3,Iris-setosa
+4.4,3.2,1.3,0.2,Iris-setosa
+5.0,3.5,1.6,0.6,Iris-setosa
+5.1,3.8,1.9,0.4,Iris-setosa
+4.8,3.0,1.4,0.3,Iris-setosa
+5.1,3.8,1.6,0.2,Iris-setosa
+4.6,3.2,1.4,0.2,Iris-setosa
+5.3,3.7,1.5,0.2,Iris-setosa
+5.0,3.3,1.4,0.2,Iris-setosa
+7.0,3.2,4.7,1.4,Iris-versicolor
+6.4,3.2,4.5,1.5,Iris-versicolor
+6.9,3.1,4.9,1.5,Iris-versicolor
+5.5,2.3,4.0,1.3,Iris-versicolor
+6.5,2.8,4.6,1.5,Iris-versicolor
+5.7,2.8,4.5,1.3,Iris-versicolor
+6.3,3.3,4.7,1.6,Iris-versicolor
+4.9,2.4,3.3,1.0,Iris-versicolor
+6.6,2.9,4.6,1.3,Iris-versicolor
+5.2,2.7,3.9,1.4,Iris-versicolor
+5.0,2.0,3.5,1.0,Iris-versicolor
+5.9,3.0,4.2,1.5,Iris-versicolor
+6.0,2.2,4.0,1.0,Iris-versicolor
+6.1,2.9,4.7,1.4,Iris-versicolor
+5.6,2.9,3.6,1.3,Iris-versicolor
+6.7,3.1,4.4,1.4,Iris-versicolor
+5.6,3.0,4.5,1.5,Iris-versicolor
+5.8,2.7,4.1,1.0,Iris-versicolor
+6.2,2.2,4.5,1.5,Iris-versicolor
+5.6,2.5,3.9,1.1,Iris-versicolor
+5.9,3.2,4.8,1.8,Iris-versicolor
+6.1,2.8,4.0,1.3,Iris-versicolor
+6.3,2.5,4.9,1.5,Iris-versicolor
+6.1,2.8,4.7,1.2,Iris-versicolor
+6.4,2.9,4.3,1.3,Iris-versicolor
+6.6,3.0,4.4,1.4,Iris-versicolor
+6.8,2.8,4.8,1.4,Iris-versicolor
+6.7,3.0,5.0,1.7,Iris-versicolor
+6.0,2.9,4.5,1.5,Iris-versicolor
+5.7,2.6,3.5,1.0,Iris-versicolor
+5.5,2.4,3.8,1.1,Iris-versicolor
+5.5,2.4,3.7,1.0,Iris-versicolor
+5.8,2.7,3.9,1.2,Iris-versicolor
+6.0,2.7,5.1,1.6,Iris-versicolor
+5.4,3.0,4.5,1.5,Iris-versicolor
+6.0,3.4,4.5,1.6,Iris-versicolor
+6.7,3.1,4.7,1.5,Iris-versicolor
+6.3,2.3,4.4,1.3,Iris-versicolor
+5.6,3.0,4.1,1.3,Iris-versicolor
+5.5,2.5,4.0,1.3,Iris-versicolor
+5.5,2.6,4.4,1.2,Iris-versicolor
+6.1,3.0,4.6,1.4,Iris-versicolor
+5.8,2.6,4.0,1.2,Iris-versicolor
+5.0,2.3,3.3,1.0,Iris-versicolor
+5.6,2.7,4.2,1.3,Iris-versicolor
+5.7,3.0,4.2,1.2,Iris-versicolor
+5.7,2.9,4.2,1.3,Iris-versicolor
+6.2,2.9,4.3,1.3,Iris-versicolor
+5.1,2.5,3.0,1.1,Iris-versicolor
+5.7,2.8,4.1,1.3,Iris-versicolor
+6.3,3.3,6.0,2.5,Iris-virginica
+5.8,2.7,5.1,1.9,Iris-virginica
+7.1,3.0,5.9,2.1,Iris-virginica
+6.3,2.9,5.6,1.8,Iris-virginica
+6.5,3.0,5.8,2.2,Iris-virginica
+7.6,3.0,6.6,2.1,Iris-virginica
+4.9,2.5,4.5,1.7,Iris-virginica
+7.3,2.9,6.3,1.8,Iris-virginica
+6.7,2.5,5.8,1.8,Iris-virginica
+7.2,3.6,6.1,2.5,Iris-virginica
+6.5,3.2,5.1,2.0,Iris-virginica
+6.4,2.7,5.3,1.9,Iris-virginica
+6.8,3.0,5.5,2.1,Iris-virginica
+5.7,2.5,5.0,2.0,Iris-virginica
+5.8,2.8,5.1,2.4,Iris-virginica
+6.4,3.2,5.3,2.3,Iris-virginica
+6.5,3.0,5.5,1.8,Iris-virginica
+7.7,3.8,6.7,2.2,Iris-virginica
+7.7,2.6,6.9,2.3,Iris-virginica
+6.0,2.2,5.0,1.5,Iris-virginica
+6.9,3.2,5.7,2.3,Iris-virginica
+5.6,2.8,4.9,2.0,Iris-virginica
+7.7,2.8,6.7,2.0,Iris-virginica
+6.3,2.7,4.9,1.8,Iris-virginica
+6.7,3.3,5.7,2.1,Iris-virginica
+7.2,3.2,6.0,1.8,Iris-virginica
+6.2,2.8,4.8,1.8,Iris-virginica
+6.1,3.0,4.9,1.8,Iris-virginica
+6.4,2.8,5.6,2.1,Iris-virginica
+7.2,3.0,5.8,1.6,Iris-virginica
+7.4,2.8,6.1,1.9,Iris-virginica
+7.9,3.8,6.4,2.0,Iris-virginica
+6.4,2.8,5.6,2.2,Iris-virginica
+6.3,2.8,5.1,1.5,Iris-virginica
+6.1,2.6,5.6,1.4,Iris-virginica
+7.7,3.0,6.1,2.3,Iris-virginica
+6.3,3.4,5.6,2.4,Iris-virginica
+6.4,3.1,5.5,1.8,Iris-virginica
+6.0,3.0,4.8,1.8,Iris-virginica
+6.9,3.1,5.4,2.1,Iris-virginica
+6.7,3.1,5.6,2.4,Iris-virginica
+6.9,3.1,5.1,2.3,Iris-virginica
+5.8,2.7,5.1,1.9,Iris-virginica
+6.8,3.2,5.9,2.3,Iris-virginica
+6.7,3.3,5.7,2.5,Iris-virginica
+6.7,3.0,5.2,2.3,Iris-virginica
+6.3,2.5,5.0,1.9,Iris-virginica
+6.5,3.0,5.2,2.0,Iris-virginica
+6.2,3.4,5.4,2.3,Iris-virginica
+5.9,3.0,5.1,1.8,Iris-virginica
\ No newline at end of file
diff --git a/main.py b/main.py
index 0302675..c8a05a5 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,12 @@
 # -*- coding:utf-8 -*-
+import pandas as pd
 
+def fileIO(path):
+    data=pd.read_csv(path)
+    print(data.columns)
+    print(data.describe())
 
 
 if __name__=='__main__':
-    pass
\ No newline at end of file
+    path=input('Please input the file path: ')
+    fileIO(path)
\ No newline at end of file

From 2e826c0ef4b8604aa9b5496cade6bb883bb31b0c Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Wed, 20 Jun 2018 14:15:07 +0800
Subject: [PATCH 03/49] add file information

---
 main.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index c8a05a5..873f756 100644
--- a/main.py
+++ b/main.py
@@ -1,12 +1,26 @@
 # -*- coding:utf-8 -*-
+__author__ = 'xujia'
+
 import pandas as pd
 
-def fileIO(path):
-    data=pd.read_csv(path)
-    print(data.columns)
-    print(data.describe())
+
+def fileInfo(path):
+    '''
+    获取文件信息
+    :param path: 文件路径
+    :return: {字段名称：[字段类型，数据量，空值数]}
+    '''
+    infodict = {}
+    data = pd.read_csv(path)
+    for c in data.columns:
+        infodict[c] = data[c].dtype
+        ctype = data[c].dtype
+        nc = data[c].size - data[c].notnull().sum()
+        infodict[c] = [ctype, data[c].size, nc]  # 字段类型，数据量，空值个数
+    return infodict
 
 
-if __name__=='__main__':
-    path=input('Please input the file path: ')
-    fileIO(path)
\ No newline at end of file
+if __name__ == '__main__':
+    # path=input('Please input the file path: ')
+    path = 'iris.csv'
+    ret = fileInfo(path)

From 067c87c4e8ecf7c16fd20b7d8c6ff6e7daa229a5 Mon Sep 17 00:00:00 2001
From: GiantTao <wtctc@126.com>
Date: Wed, 20 Jun 2018 14:31:43 +0800
Subject: [PATCH 04/49] AR

---
 ARUtil.py | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100755 ARUtil.py

diff --git a/ARUtil.py b/ARUtil.py
new file mode 100755
index 0000000..ca716ba
--- /dev/null
+++ b/ARUtil.py
@@ -0,0 +1,164 @@
+# encoding:utf-8
+import pandas as pd
+import numpy as np
+import logging
+import sys
+reload(sys)
+sys.setdefaultencoding('utf8')
+
+
+class ARFilter(object):
+
+    def __init__(self, threshold=0.05, dest_var='y'):
+        self.threshold = threshold
+        self.dest_var = dest_var
+        logging.basicConfig()
+        self.logger = logging.getLogger("default")
+        self.logger.setLevel(level=logging.INFO)
+
+    def train_cal_input(self, excel_name='input.csv'):
+        """
+        AR值筛选
+        输入：宽表【变量1、变量2、目标变量】、筛选下限（默认0.05）、目标变量名称（默认y）
+        输出：筛选后的变量列表【变量名称,AR值】（按照AR值降序排列）
+        计算方式：使用单个变量与目标变量进行逻辑回归运算，返回模型的K-S值即为该变量的AR值。
+        """
+        from sklearn.linear_model import LogisticRegression
+        from sklearn.metrics import roc_curve
+        data = pd.read_csv(excel_name)
+        # 创建逻辑回归模型
+        logit_model = LogisticRegression()
+        final_list = []
+        for col in data.columns.values[0:-1]:
+            if col != self.dest_var:
+                # 特征变量值
+                X = data[col].values.reshape(-1, 1)
+                # 拆分数据集为训练集与测试集
+                x_train = X[:-20]
+                x_test = X[-20:]
+                # 目标变量值
+                y = data[self.dest_var].values.reshape(-1, 1)
+                y_train = y[:-20]
+                y_test = y[-20:]
+                # 数据拟合
+                logit_model.fit(x_train, y_train)
+                # 每一列与y列做预测
+                # prob = logit_model.predict_proba(data[col].values.reshape(-1, 1))
+                prob = logit_model.predict_proba(x_test)
+                # prob[:, 1] 预测结果为两列，分别为0值可能性与1值可能性，此处取1值可能性
+                # fpr, tpr, thresholds = roc_curve(data[self.dest_var].values.reshape(-1, 1), prob[:, 1])
+                fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1])
+                from scipy import stats
+                # AR = float(stats.ks_2samp(y_test, prob[:, 1].reshape(-1, 1)).statistic)
+                # AR = float(stats.ks_2samp(y_test.ravel(), prob[:, 1]).statistic)
+                # testDF = pd.DataFrame()
+                # testDF['predict_proba'] = prob[:,1]
+                # testDF['label'] = np.array(y_test)
+                # print self.cal_ks(testDF)
+                # print str(AR) + "-" * 30
+                ks = abs(fpr - tpr).max()
+                # print str(ks) + "*" * 30
+                # print ks
+                if ks > self.threshold:
+                    final_list.append({'varName': col, "AR": ks})
+                else:
+                    self.logger.info('列：' + col + '的AR值为:' + str(ks) + ", 低于阈值：" + str(self.threshold))
+        # AR值排序
+        final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True)
+        self.logger.info(pd.DataFrame(final_list))
+        pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False)
+
+    def cal_ks(self, data):
+        """手动计算KS值"""
+        #  对样本数据排序，根据预测值升序排序
+        sorted_list = data.sort_values(['predict_proba'], ascending=True)
+        total_good_count = sorted_list['label'].sum() * 1.0
+        total_bad_count = (sorted_list.shape[0] - total_good_count) * 1.0
+        max_ks = 0.0
+        good_count = 0.0
+        bad_count = 0.0
+        for index, row in sorted_list.iterrows():
+            if row['label'] == 0:
+                bad_count += 1.0
+            else:
+                good_count += 1.0
+            val = abs(bad_count/total_bad_count - good_count/total_good_count)
+            max_ks = max(max_ks, val)
+        return max_ks
+
+    def cal_ar(self, excel_name='test.xlsx'):
+        excel = pd.read_excel(excel_name)
+        if excel.columns.size < 2:
+            self.logger.error("未找到Excel数据源！")
+            return
+        dest_value = excel[self.dest_var]
+        final_list = []
+        # result_frame = pd.DataFrame(columns=['varName', 'AR'])
+        for col in excel.columns:
+            if col != self.dest_var:
+                AR = float(stats.ks_2samp(excel[col], dest_value).statistic)
+                final_list
+        # self.logger.info(final_list)
+        # final_list.append({'AR': 1.0, 'colName': u'var3'})
+        # final_list.append({'AR': 0.8, 'colName': u'var4'})
+        final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True)
+        # self.logger.info("final result:" + str(final_list))
+        # self.logger.info("123")
+        self.logger.info(pd.DataFrame(final_list))
+        pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False)
+
+    def fill_empty_value(self, col_name, file_name='input.xls', default_value=0):
+        """
+        缺失值填充
+        输入：宽表【变量1、变量2、目标变量】，变量名称，缺失值填充值（默认0）
+        计算方式：直接将指定变量中的缺失值用参数中的填充值进行填充
+        输出：填充后的宽表，变量缺失率
+        """
+        data = pd.read_excel(file_name)
+        # print(str(np.nan))
+        # print(type(str(np.nan)))
+        # print type(str(data['emptyCol'][14]))
+        # print len(str(data['emptyCol'][14]).strip())
+        # print type(str(data['emptyCol'][14]).strip())
+        if col_name not in data.columns.values:
+            self.logger.error("输入宽表中不存在指定变量")
+            return
+        else:
+            empty_count = data.shape[0] - data[col_name].count()
+            if empty_count > 0:
+                self.logger.info('当前共' + str(data.shape[0]) + '个变量值，其中缺失值个数为' + str(empty_count))
+                # 替换空串为NAN
+                # data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value)
+                data['result'] = data[col_name].replace(' ', np.nan).fillna(value=default_value)
+                # self.logger.info('填补后，缺失值个数为' + str(data.shape[0] - data[col_name].count()))
+                self.logger.info('填补后，缺失值个数为' + str(data.shape[0] - data['result'].count()))
+                data.to_excel('result.xls', index=False)
+            else:
+                self.logger.info('当前不存在缺失值')
+
+    def del_empty_value(self, file_name='input.xls', empty_rate_threshold=0.5):
+        """
+        缺失值剔除
+        输入：宽表【变量1、变量2、目标变量】，缺失率（默认0.5）
+        计算方式：计算宽表中各个变量的缺失率，并剔除缺失率超过0.5的变量
+        输出：处理后宽表
+        """
+        data = pd.read_excel(file_name)
+        for col in data.columns.values:
+            if col == 'y':
+                continue
+            empty_ratio = (data[col].shape[0] - data[col].count())/data[col].shape[0]
+            if empty_ratio >= empty_rate_threshold:
+                self.logger.info("变量：" + col + "缺失率为" + str(empty_ratio) + ",高于阈值：" + str(empty_rate_threshold))
+                data = data.drop(col, axis=1)
+        data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False)
+
+
+def run():
+    ar = ARFilter()
+    ar.train_cal_input()
+    # ar.fill_empty_value(col_name='emptyCol', file_name='empty.xls', default_value=0)
+    # ar.del_empty_value(file_name="empty_ratio.xls")
+
+if __name__ == "__main__":
+    run()

From 654484dd85de8f96780f1952688a5f587182684d Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Wed, 20 Jun 2018 15:08:06 +0800
Subject: [PATCH 05/49] add file information

---
 iris.csv | 302 +++++++++++++++++++++++++++----------------------------
 1 file changed, 151 insertions(+), 151 deletions(-)

diff --git a/iris.csv b/iris.csv
index c19b9c3..1f80bbe 100644
--- a/iris.csv
+++ b/iris.csv
@@ -1,151 +1,151 @@
-SepalLength,SepalWidth,PetalLength,PetalWidth,Name
-5.1,3.5,1.4,0.2,Iris-setosa
-4.9,3.0,1.4,0.2,Iris-setosa
-4.7,3.2,1.3,0.2,Iris-setosa
-4.6,3.1,1.5,0.2,Iris-setosa
-5.0,3.6,1.4,0.2,Iris-setosa
-5.4,3.9,1.7,0.4,Iris-setosa
-4.6,3.4,1.4,0.3,Iris-setosa
-5.0,3.4,1.5,0.2,Iris-setosa
-4.4,2.9,1.4,0.2,Iris-setosa
-4.9,3.1,1.5,0.1,Iris-setosa
-5.4,3.7,1.5,0.2,Iris-setosa
-4.8,3.4,1.6,0.2,Iris-setosa
-4.8,3.0,1.4,0.1,Iris-setosa
-4.3,3.0,1.1,0.1,Iris-setosa
-5.8,4.0,1.2,0.2,Iris-setosa
-5.7,4.4,1.5,0.4,Iris-setosa
-5.4,3.9,1.3,0.4,Iris-setosa
-5.1,3.5,1.4,0.3,Iris-setosa
-5.7,3.8,1.7,0.3,Iris-setosa
-5.1,3.8,1.5,0.3,Iris-setosa
-5.4,3.4,1.7,0.2,Iris-setosa
-5.1,3.7,1.5,0.4,Iris-setosa
-4.6,3.6,1.0,0.2,Iris-setosa
-5.1,3.3,1.7,0.5,Iris-setosa
-4.8,3.4,1.9,0.2,Iris-setosa
-5.0,3.0,1.6,0.2,Iris-setosa
-5.0,3.4,1.6,0.4,Iris-setosa
-5.2,3.5,1.5,0.2,Iris-setosa
-5.2,3.4,1.4,0.2,Iris-setosa
-4.7,3.2,1.6,0.2,Iris-setosa
-4.8,3.1,1.6,0.2,Iris-setosa
-5.4,3.4,1.5,0.4,Iris-setosa
-5.2,4.1,1.5,0.1,Iris-setosa
-5.5,4.2,1.4,0.2,Iris-setosa
-4.9,3.1,1.5,0.1,Iris-setosa
-5.0,3.2,1.2,0.2,Iris-setosa
-5.5,3.5,1.3,0.2,Iris-setosa
-4.9,3.1,1.5,0.1,Iris-setosa
-4.4,3.0,1.3,0.2,Iris-setosa
-5.1,3.4,1.5,0.2,Iris-setosa
-5.0,3.5,1.3,0.3,Iris-setosa
-4.5,2.3,1.3,0.3,Iris-setosa
-4.4,3.2,1.3,0.2,Iris-setosa
-5.0,3.5,1.6,0.6,Iris-setosa
-5.1,3.8,1.9,0.4,Iris-setosa
-4.8,3.0,1.4,0.3,Iris-setosa
-5.1,3.8,1.6,0.2,Iris-setosa
-4.6,3.2,1.4,0.2,Iris-setosa
-5.3,3.7,1.5,0.2,Iris-setosa
-5.0,3.3,1.4,0.2,Iris-setosa
-7.0,3.2,4.7,1.4,Iris-versicolor
-6.4,3.2,4.5,1.5,Iris-versicolor
-6.9,3.1,4.9,1.5,Iris-versicolor
-5.5,2.3,4.0,1.3,Iris-versicolor
-6.5,2.8,4.6,1.5,Iris-versicolor
-5.7,2.8,4.5,1.3,Iris-versicolor
-6.3,3.3,4.7,1.6,Iris-versicolor
-4.9,2.4,3.3,1.0,Iris-versicolor
-6.6,2.9,4.6,1.3,Iris-versicolor
-5.2,2.7,3.9,1.4,Iris-versicolor
-5.0,2.0,3.5,1.0,Iris-versicolor
-5.9,3.0,4.2,1.5,Iris-versicolor
-6.0,2.2,4.0,1.0,Iris-versicolor
-6.1,2.9,4.7,1.4,Iris-versicolor
-5.6,2.9,3.6,1.3,Iris-versicolor
-6.7,3.1,4.4,1.4,Iris-versicolor
-5.6,3.0,4.5,1.5,Iris-versicolor
-5.8,2.7,4.1,1.0,Iris-versicolor
-6.2,2.2,4.5,1.5,Iris-versicolor
-5.6,2.5,3.9,1.1,Iris-versicolor
-5.9,3.2,4.8,1.8,Iris-versicolor
-6.1,2.8,4.0,1.3,Iris-versicolor
-6.3,2.5,4.9,1.5,Iris-versicolor
-6.1,2.8,4.7,1.2,Iris-versicolor
-6.4,2.9,4.3,1.3,Iris-versicolor
-6.6,3.0,4.4,1.4,Iris-versicolor
-6.8,2.8,4.8,1.4,Iris-versicolor
-6.7,3.0,5.0,1.7,Iris-versicolor
-6.0,2.9,4.5,1.5,Iris-versicolor
-5.7,2.6,3.5,1.0,Iris-versicolor
-5.5,2.4,3.8,1.1,Iris-versicolor
-5.5,2.4,3.7,1.0,Iris-versicolor
-5.8,2.7,3.9,1.2,Iris-versicolor
-6.0,2.7,5.1,1.6,Iris-versicolor
-5.4,3.0,4.5,1.5,Iris-versicolor
-6.0,3.4,4.5,1.6,Iris-versicolor
-6.7,3.1,4.7,1.5,Iris-versicolor
-6.3,2.3,4.4,1.3,Iris-versicolor
-5.6,3.0,4.1,1.3,Iris-versicolor
-5.5,2.5,4.0,1.3,Iris-versicolor
-5.5,2.6,4.4,1.2,Iris-versicolor
-6.1,3.0,4.6,1.4,Iris-versicolor
-5.8,2.6,4.0,1.2,Iris-versicolor
-5.0,2.3,3.3,1.0,Iris-versicolor
-5.6,2.7,4.2,1.3,Iris-versicolor
-5.7,3.0,4.2,1.2,Iris-versicolor
-5.7,2.9,4.2,1.3,Iris-versicolor
-6.2,2.9,4.3,1.3,Iris-versicolor
-5.1,2.5,3.0,1.1,Iris-versicolor
-5.7,2.8,4.1,1.3,Iris-versicolor
-6.3,3.3,6.0,2.5,Iris-virginica
-5.8,2.7,5.1,1.9,Iris-virginica
-7.1,3.0,5.9,2.1,Iris-virginica
-6.3,2.9,5.6,1.8,Iris-virginica
-6.5,3.0,5.8,2.2,Iris-virginica
-7.6,3.0,6.6,2.1,Iris-virginica
-4.9,2.5,4.5,1.7,Iris-virginica
-7.3,2.9,6.3,1.8,Iris-virginica
-6.7,2.5,5.8,1.8,Iris-virginica
-7.2,3.6,6.1,2.5,Iris-virginica
-6.5,3.2,5.1,2.0,Iris-virginica
-6.4,2.7,5.3,1.9,Iris-virginica
-6.8,3.0,5.5,2.1,Iris-virginica
-5.7,2.5,5.0,2.0,Iris-virginica
-5.8,2.8,5.1,2.4,Iris-virginica
-6.4,3.2,5.3,2.3,Iris-virginica
-6.5,3.0,5.5,1.8,Iris-virginica
-7.7,3.8,6.7,2.2,Iris-virginica
-7.7,2.6,6.9,2.3,Iris-virginica
-6.0,2.2,5.0,1.5,Iris-virginica
-6.9,3.2,5.7,2.3,Iris-virginica
-5.6,2.8,4.9,2.0,Iris-virginica
-7.7,2.8,6.7,2.0,Iris-virginica
-6.3,2.7,4.9,1.8,Iris-virginica
-6.7,3.3,5.7,2.1,Iris-virginica
-7.2,3.2,6.0,1.8,Iris-virginica
-6.2,2.8,4.8,1.8,Iris-virginica
-6.1,3.0,4.9,1.8,Iris-virginica
-6.4,2.8,5.6,2.1,Iris-virginica
-7.2,3.0,5.8,1.6,Iris-virginica
-7.4,2.8,6.1,1.9,Iris-virginica
-7.9,3.8,6.4,2.0,Iris-virginica
-6.4,2.8,5.6,2.2,Iris-virginica
-6.3,2.8,5.1,1.5,Iris-virginica
-6.1,2.6,5.6,1.4,Iris-virginica
-7.7,3.0,6.1,2.3,Iris-virginica
-6.3,3.4,5.6,2.4,Iris-virginica
-6.4,3.1,5.5,1.8,Iris-virginica
-6.0,3.0,4.8,1.8,Iris-virginica
-6.9,3.1,5.4,2.1,Iris-virginica
-6.7,3.1,5.6,2.4,Iris-virginica
-6.9,3.1,5.1,2.3,Iris-virginica
-5.8,2.7,5.1,1.9,Iris-virginica
-6.8,3.2,5.9,2.3,Iris-virginica
-6.7,3.3,5.7,2.5,Iris-virginica
-6.7,3.0,5.2,2.3,Iris-virginica
-6.3,2.5,5.0,1.9,Iris-virginica
-6.5,3.0,5.2,2.0,Iris-virginica
-6.2,3.4,5.4,2.3,Iris-virginica
-5.9,3.0,5.1,1.8,Iris-virginica
\ No newline at end of file
+SepalLength,SepalWidth,PetalLength,PetalWidth,Label
+5.1,3.5,1.4,0.2,0
+4.9,3.0,1.4,0.2,0
+4.7,3.2,1.3,0.2,0
+4.6,3.1,1.5,0.2,0
+5.0,3.6,1.4,0.2,0
+5.4,3.9,1.7,0.4,0
+4.6,3.4,1.4,0.3,0
+5.0,3.4,1.5,0.2,0
+4.4,2.9,1.4,0.2,0
+4.9,3.1,1.5,0.1,0
+5.4,3.7,1.5,0.2,0
+4.8,3.4,1.6,0.2,0
+4.8,3.0,1.4,0.1,0
+4.3,3.0,1.1,0.1,0
+5.8,4.0,1.2,0.2,0
+5.7,4.4,1.5,0.4,0
+5.4,3.9,1.3,0.4,0
+5.1,3.5,1.4,0.3,0
+5.7,3.8,1.7,0.3,0
+5.1,3.8,1.5,0.3,0
+5.4,3.4,1.7,0.2,0
+5.1,3.7,1.5,0.4,0
+4.6,3.6,1.0,0.2,0
+5.1,3.3,1.7,0.5,0
+4.8,3.4,1.9,0.2,0
+5.0,3.0,1.6,0.2,0
+5.0,3.4,1.6,0.4,0
+5.2,3.5,1.5,0.2,0
+5.2,3.4,1.4,0.2,0
+4.7,3.2,1.6,0.2,0
+4.8,3.1,1.6,0.2,0
+5.4,3.4,1.5,0.4,0
+5.2,4.1,1.5,0.1,0
+5.5,4.2,1.4,0.2,0
+4.9,3.1,1.5,0.1,0
+5.0,3.2,1.2,0.2,0
+5.5,3.5,1.3,0.2,0
+4.9,3.1,1.5,0.1,0
+4.4,3.0,1.3,0.2,0
+5.1,3.4,1.5,0.2,0
+5.0,3.5,1.3,0.3,0
+4.5,2.3,1.3,0.3,0
+4.4,3.2,1.3,0.2,0
+5.0,3.5,1.6,0.6,0
+5.1,3.8,1.9,0.4,0
+4.8,3.0,1.4,0.3,0
+5.1,3.8,1.6,0.2,0
+4.6,3.2,1.4,0.2,0
+5.3,3.7,1.5,0.2,0
+5.0,3.3,1.4,0.2,0
+7.0,3.2,4.7,1.4,1
+6.4,3.2,4.5,1.5,1
+6.9,3.1,4.9,1.5,1
+5.5,2.3,4.0,1.3,1
+6.5,2.8,4.6,1.5,1
+5.7,2.8,4.5,1.3,1
+6.3,3.3,4.7,1.6,1
+4.9,2.4,3.3,1.0,1
+6.6,2.9,4.6,1.3,1
+5.2,2.7,3.9,1.4,1
+5.0,2.0,3.5,1.0,1
+5.9,3.0,4.2,1.5,1
+6.0,2.2,4.0,1.0,1
+6.1,2.9,4.7,1.4,1
+5.6,2.9,3.6,1.3,1
+6.7,3.1,4.4,1.4,1
+5.6,3.0,4.5,1.5,1
+5.8,2.7,4.1,1.0,1
+6.2,2.2,4.5,1.5,1
+5.6,2.5,3.9,1.1,1
+5.9,3.2,4.8,1.8,1
+6.1,2.8,4.0,1.3,1
+6.3,2.5,4.9,1.5,1
+6.1,2.8,4.7,1.2,1
+6.4,2.9,4.3,1.3,1
+6.6,3.0,4.4,1.4,1
+6.8,2.8,4.8,1.4,1
+6.7,3.0,5.0,1.7,1
+6.0,2.9,4.5,1.5,1
+5.7,2.6,3.5,1.0,1
+5.5,2.4,3.8,1.1,1
+5.5,2.4,3.7,1.0,1
+5.8,2.7,3.9,1.2,1
+6.0,2.7,5.1,1.6,1
+5.4,3.0,4.5,1.5,1
+6.0,3.4,4.5,1.6,1
+6.7,3.1,4.7,1.5,1
+6.3,2.3,4.4,1.3,1
+5.6,3.0,4.1,1.3,1
+5.5,2.5,4.0,1.3,1
+5.5,2.6,4.4,1.2,1
+6.1,3.0,4.6,1.4,1
+5.8,2.6,4.0,1.2,1
+5.0,2.3,3.3,1.0,1
+5.6,2.7,4.2,1.3,1
+5.7,3.0,4.2,1.2,1
+5.7,2.9,4.2,1.3,1
+6.2,2.9,4.3,1.3,1
+5.1,2.5,3.0,1.1,1
+5.7,2.8,4.1,1.3,1
+6.3,3.3,6.0,2.5,2
+5.8,2.7,5.1,1.9,2
+7.1,3.0,5.9,2.1,2
+6.3,2.9,5.6,1.8,2
+6.5,3.0,5.8,2.2,2
+7.6,3.0,6.6,2.1,2
+4.9,2.5,4.5,1.7,2
+7.3,2.9,6.3,1.8,2
+6.7,2.5,5.8,1.8,2
+7.2,3.6,6.1,2.5,2
+6.5,3.2,5.1,2.0,2
+6.4,2.7,5.3,1.9,2
+6.8,3.0,5.5,2.1,2
+5.7,2.5,5.0,2.0,2
+5.8,2.8,5.1,2.4,2
+6.4,3.2,5.3,2.3,2
+6.5,3.0,5.5,1.8,2
+7.7,3.8,6.7,2.2,2
+7.7,2.6,6.9,2.3,2
+6.0,2.2,5.0,1.5,2
+6.9,3.2,5.7,2.3,2
+5.6,2.8,4.9,2.0,2
+7.7,2.8,6.7,2.0,2
+6.3,2.7,4.9,1.8,2
+6.7,3.3,5.7,2.1,2
+7.2,3.2,6.0,1.8,2
+6.2,2.8,4.8,1.8,2
+6.1,3.0,4.9,1.8,2
+6.4,2.8,5.6,2.1,2
+7.2,3.0,5.8,1.6,2
+7.4,2.8,6.1,1.9,2
+7.9,3.8,6.4,2.0,2
+6.4,2.8,5.6,2.2,2
+6.3,2.8,5.1,1.5,2
+6.1,2.6,5.6,1.4,2
+7.7,3.0,6.1,2.3,2
+6.3,3.4,5.6,2.4,2
+6.4,3.1,5.5,1.8,2
+6.0,3.0,4.8,1.8,2
+6.9,3.1,5.4,2.1,2
+6.7,3.1,5.6,2.4,2
+6.9,3.1,5.1,2.3,2
+5.8,2.7,5.1,1.9,2
+6.8,3.2,5.9,2.3,2
+6.7,3.3,5.7,2.5,2
+6.7,3.0,5.2,2.3,2
+6.3,2.5,5.0,1.9,2
+6.5,3.0,5.2,2.0,2
+6.2,3.4,5.4,2.3,2
+5.9,3.0,5.1,1.8,2
\ No newline at end of file

From 72c28ae051b9e150cdba7d0ba1118894f9f43424 Mon Sep 17 00:00:00 2001
From: GiantTao <wtctc@126.com>
Date: Wed, 20 Jun 2018 16:36:35 +0800
Subject: [PATCH 06/49] Update ARUtil.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

注释编码
---
 ARUtil.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ARUtil.py b/ARUtil.py
index ca716ba..ed05eec 100755
--- a/ARUtil.py
+++ b/ARUtil.py
@@ -2,9 +2,9 @@
 import pandas as pd
 import numpy as np
 import logging
-import sys
-reload(sys)
-sys.setdefaultencoding('utf8')
+# import sys
+# reload(sys)
+# sys.setdefaultencoding('utf8')
 
 
 class ARFilter(object):

From 89457ac9b72f88c96813dc76fcb0ade13731fcb9 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Wed, 20 Jun 2018 16:57:05 +0800
Subject: [PATCH 07/49] add file data split

---
 main.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index 873f756..3d360a4 100644
--- a/main.py
+++ b/main.py
@@ -2,13 +2,14 @@
 __author__ = 'xujia'
 
 import pandas as pd
+import numpy as np
 
 
 def fileInfo(path):
     '''
     获取文件信息
     :param path: 文件路径
-    :return: {字段名称：[字段类型，数据量，空值数]}
+    :return: {字段名称：[字段类型，数据量，空值个数]}
     '''
     infodict = {}
     data = pd.read_csv(path)
@@ -17,10 +18,29 @@ def fileInfo(path):
         ctype = data[c].dtype
         nc = data[c].size - data[c].notnull().sum()
         infodict[c] = [ctype, data[c].size, nc]  # 字段类型，数据量，空值个数
-    return infodict
+    return infodict, data
+
+
+def dataSplit(data, ratio):
+    '''
+    数据分割
+    :param data:带分割数据
+    :param ratio: 分割比例
+    :return: （数据集1，数据集2）
+    '''
+    dataCount = data.shape[0]
+    selectedCount = int(dataCount * ratio)
+    if selectedCount > 0:
+        splitedData = np.split(data.sample(frac=1), [selectedCount], axis=0)
+    else:
+        return 'Data is too less'
+    return splitedData
 
 
 if __name__ == '__main__':
     # path=input('Please input the file path: ')
     path = 'iris.csv'
-    ret = fileInfo(path)
+    dict, data = fileInfo(path)
+    t = dataSplit(data, 0.8)
+    print(t[0])
+    print(t[1])

From de96725dcc0dec677973d95bd6313bb62e9abd6e Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Wed, 20 Jun 2018 17:00:55 +0800
Subject: [PATCH 08/49] add file data split

---
 iris.csv | 52 +---------------------------------------------------
 1 file changed, 1 insertion(+), 51 deletions(-)

diff --git a/iris.csv b/iris.csv
index 1f80bbe..2b6058e 100644
--- a/iris.csv
+++ b/iris.csv
@@ -98,54 +98,4 @@ SepalLength,SepalWidth,PetalLength,PetalWidth,Label
 5.7,2.9,4.2,1.3,1
 6.2,2.9,4.3,1.3,1
 5.1,2.5,3.0,1.1,1
-5.7,2.8,4.1,1.3,1
-6.3,3.3,6.0,2.5,2
-5.8,2.7,5.1,1.9,2
-7.1,3.0,5.9,2.1,2
-6.3,2.9,5.6,1.8,2
-6.5,3.0,5.8,2.2,2
-7.6,3.0,6.6,2.1,2
-4.9,2.5,4.5,1.7,2
-7.3,2.9,6.3,1.8,2
-6.7,2.5,5.8,1.8,2
-7.2,3.6,6.1,2.5,2
-6.5,3.2,5.1,2.0,2
-6.4,2.7,5.3,1.9,2
-6.8,3.0,5.5,2.1,2
-5.7,2.5,5.0,2.0,2
-5.8,2.8,5.1,2.4,2
-6.4,3.2,5.3,2.3,2
-6.5,3.0,5.5,1.8,2
-7.7,3.8,6.7,2.2,2
-7.7,2.6,6.9,2.3,2
-6.0,2.2,5.0,1.5,2
-6.9,3.2,5.7,2.3,2
-5.6,2.8,4.9,2.0,2
-7.7,2.8,6.7,2.0,2
-6.3,2.7,4.9,1.8,2
-6.7,3.3,5.7,2.1,2
-7.2,3.2,6.0,1.8,2
-6.2,2.8,4.8,1.8,2
-6.1,3.0,4.9,1.8,2
-6.4,2.8,5.6,2.1,2
-7.2,3.0,5.8,1.6,2
-7.4,2.8,6.1,1.9,2
-7.9,3.8,6.4,2.0,2
-6.4,2.8,5.6,2.2,2
-6.3,2.8,5.1,1.5,2
-6.1,2.6,5.6,1.4,2
-7.7,3.0,6.1,2.3,2
-6.3,3.4,5.6,2.4,2
-6.4,3.1,5.5,1.8,2
-6.0,3.0,4.8,1.8,2
-6.9,3.1,5.4,2.1,2
-6.7,3.1,5.6,2.4,2
-6.9,3.1,5.1,2.3,2
-5.8,2.7,5.1,1.9,2
-6.8,3.2,5.9,2.3,2
-6.7,3.3,5.7,2.5,2
-6.7,3.0,5.2,2.3,2
-6.3,2.5,5.0,1.9,2
-6.5,3.0,5.2,2.0,2
-6.2,3.4,5.4,2.3,2
-5.9,3.0,5.1,1.8,2
\ No newline at end of file
+5.7,2.8,4.1,1.3,1
\ No newline at end of file

From 20153a0003f8e04b15d329e4dee3be43e4332946 Mon Sep 17 00:00:00 2001
From: GiantTao <wtctc@126.com>
Date: Thu, 21 Jun 2018 08:42:26 +0800
Subject: [PATCH 09/49] =?UTF-8?q?=E5=89=94=E9=99=A4=E9=83=A8=E5=88=86?=
 =?UTF-8?q?=E5=AD=97=E6=AE=B5=EF=BC=8C=E6=8F=90=E4=BE=9B=E7=BC=BA=E5=A4=B1?=
 =?UTF-8?q?=E5=80=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 iris.csv | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/iris.csv b/iris.csv
index 2b6058e..a320fb3 100644
--- a/iris.csv
+++ b/iris.csv
@@ -1,18 +1,18 @@
 SepalLength,SepalWidth,PetalLength,PetalWidth,Label
-5.1,3.5,1.4,0.2,0
-4.9,3.0,1.4,0.2,0
-4.7,3.2,1.3,0.2,0
-4.6,3.1,1.5,0.2,0
-5.0,3.6,1.4,0.2,0
-5.4,3.9,1.7,0.4,0
-4.6,3.4,1.4,0.3,0
+5.1,3.5,,0.2,0
+4.9,3.0,,0.2,0
+4.7,3.2,,0.2,0
+4.6,3.1,,0.2,0
+5.0,3.6,,0.2,0
+5.4,3.9,,0.4,0
+4.6,3.4,,0.3,0
 5.0,3.4,1.5,0.2,0
 4.4,2.9,1.4,0.2,0
 4.9,3.1,1.5,0.1,0
 5.4,3.7,1.5,0.2,0
 4.8,3.4,1.6,0.2,0
 4.8,3.0,1.4,0.1,0
-4.3,3.0,1.1,0.1,0
+4.3,,1.1,0.1,0
 5.8,4.0,1.2,0.2,0
 5.7,4.4,1.5,0.4,0
 5.4,3.9,1.3,0.4,0

From 823958a3170820b6d1b81099d10a852b77bb7efa Mon Sep 17 00:00:00 2001
From: GiantTao <wtctc@126.com>
Date: Thu, 21 Jun 2018 10:27:33 +0800
Subject: [PATCH 10/49] =?UTF-8?q?=E6=B5=81=E7=A8=8B=E5=8C=96=E5=A4=84?=
 =?UTF-8?q?=E7=90=86=E6=96=87=E4=BB=B6=E8=AF=BB=E5=8F=96=E3=80=81=E7=A9=BA?=
 =?UTF-8?q?=E5=80=BC=E5=A1=AB=E5=85=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ARUtil.py | 90 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 73 insertions(+), 17 deletions(-)

diff --git a/ARUtil.py b/ARUtil.py
index ed05eec..34ed408 100755
--- a/ARUtil.py
+++ b/ARUtil.py
@@ -107,43 +107,36 @@ def cal_ar(self, excel_name='test.xlsx'):
         self.logger.info(pd.DataFrame(final_list))
         pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False)
 
-    def fill_empty_value(self, col_name, file_name='input.xls', default_value=0):
+    def fill_empty_value(self, col_name, data, default_value=0):
         """
         缺失值填充
         输入：宽表【变量1、变量2、目标变量】，变量名称，缺失值填充值（默认0）
         计算方式：直接将指定变量中的缺失值用参数中的填充值进行填充
         输出：填充后的宽表，变量缺失率
         """
-        data = pd.read_excel(file_name)
-        # print(str(np.nan))
-        # print(type(str(np.nan)))
-        # print type(str(data['emptyCol'][14]))
-        # print len(str(data['emptyCol'][14]).strip())
-        # print type(str(data['emptyCol'][14]).strip())
+        # data = pd.read_excel(file_name)
         if col_name not in data.columns.values:
             self.logger.error("输入宽表中不存在指定变量")
             return
         else:
-            empty_count = data.shape[0] - data[col_name].count()
+            empty_count = data[col_name].shape[0] - data[col_name].count()
             if empty_count > 0:
                 self.logger.info('当前共' + str(data.shape[0]) + '个变量值，其中缺失值个数为' + str(empty_count))
                 # 替换空串为NAN
-                # data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value)
-                data['result'] = data[col_name].replace(' ', np.nan).fillna(value=default_value)
-                # self.logger.info('填补后，缺失值个数为' + str(data.shape[0] - data[col_name].count()))
-                self.logger.info('填补后，缺失值个数为' + str(data.shape[0] - data['result'].count()))
-                data.to_excel('result.xls', index=False)
+                data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value)
+                self.logger.info('填补后，缺失值个数为' + str(data[col_name].shape[0] - data[col_name].count()))
+                # data.to_excel('result.xls', index=False)
+                return data
             else:
                 self.logger.info('当前不存在缺失值')
 
-    def del_empty_value(self, file_name='input.xls', empty_rate_threshold=0.5):
+    def del_empty_value(self, data, empty_rate_threshold=0.5):
         """
         缺失值剔除
         输入：宽表【变量1、变量2、目标变量】，缺失率（默认0.5）
         计算方式：计算宽表中各个变量的缺失率，并剔除缺失率超过0.5的变量
         输出：处理后宽表
         """
-        data = pd.read_excel(file_name)
         for col in data.columns.values:
             if col == 'y':
                 continue
@@ -151,14 +144,77 @@ def del_empty_value(self, file_name='input.xls', empty_rate_threshold=0.5):
             if empty_ratio >= empty_rate_threshold:
                 self.logger.info("变量：" + col + "缺失率为" + str(empty_ratio) + ",高于阈值：" + str(empty_rate_threshold))
                 data = data.drop(col, axis=1)
-        data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False)
+        return data
+        # data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False)
+
+    def console_input(self, prompt="", if_value=[], else_value=[], if_rtn="", else_rtn=""):
+        rtn = input(prompt)
+        if rtn.strip() in if_value:
+            return if_rtn
+        elif rtn.strip() in else_value or len(else_value) == 0:
+            return else_rtn
+        else:
+            raise IOError("未匹配到条件")
+
+    def file_info(self, path):
+        """
+        获取文件信息
+        :param path: 文件路径
+        :return: {字段名称：[字段类型，数据量，空值个数]}
+        """
+        info_dict = {}
+        data = pd.read_csv(path)
+        for c in data.columns:
+            ctype = data[c].dtype
+            nc = data[c].size - data[c].notnull().sum()
+            info_dict[c] = [ctype, data[c].size, nc]  # 字段类型，数据量，空值个数
+        return info_dict, data
+
+    def is_contain_empty_value(self, file_dict):
+        empty_col_list = []
+        for item in file_dict:
+            self.logger.info(file_dict[item])
+            if int(file_dict[item][2]) > 0:
+                self.logger.info("列" + item + "空值个数：" + str(file_dict[item][2]))
+                empty_col_list.append(item)
+        if len(empty_col_list) > 0:
+            return True, empty_col_list
+        else:
+            return False, []
+
+    def main(self):
+        file_path = input("请输入待处理的文件名路径：")
+        import os.path
+        if os.path.isfile(file_path):
+            file_dict, data = self.file_info(file_path)
+            is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict)
+            if is_contain_empty_value:
+                self.logger.info("当前存在缺失值")
+                is_fill_empty = self.console_input(prompt="是否需要填充数据？1：是，其他值：否", if_value=["1"], else_value=[],
+                                                   if_rtn=True, else_rtn=False)
+                if is_fill_empty:
+                    for col in empty_col_list:
+                        fill_value = input("请输入列" + col + "待填充的数据：")
+                        self.logger.info("列" + col + "将填充数据：" + fill_value)
+                        data = self.fill_empty_value(col_name=col, data=data, default_value=fill_value)
+                    print(data)
+                else:
+                    self.logger.info("不填充数据，程序退出")
+            else:
+                self.logger.info("当前不存在缺失数据")
+        else:
+            self.logger.error("指定的文件路径不存在")
 
 
 def run():
     ar = ARFilter()
-    ar.train_cal_input()
+    # ar.train_cal_input()
     # ar.fill_empty_value(col_name='emptyCol', file_name='empty.xls', default_value=0)
     # ar.del_empty_value(file_name="empty_ratio.xls")
+    ar.main()
+
 
 if __name__ == "__main__":
+
     run()
+

From 189d1422b249932db8489459d0a48e2621b3cb83 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 21 Jun 2018 10:36:58 +0800
Subject: [PATCH 11/49] add file data split

---
 main.py | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/main.py b/main.py
index 3d360a4..0c684a0 100644
--- a/main.py
+++ b/main.py
@@ -14,20 +14,47 @@ def fileInfo(path):
     infodict = {}
     data = pd.read_csv(path)
     for c in data.columns:
-        infodict[c] = data[c].dtype
         ctype = data[c].dtype
         nc = data[c].size - data[c].notnull().sum()
         infodict[c] = [ctype, data[c].size, nc]  # 字段类型，数据量，空值个数
     return infodict, data
 
 
-def dataSplit(data, ratio):
+def changeType(df, featypedict):
+    typedict = {1: 'float64', 2: 'int64', 3: 'str'}
+    feadict = dict(zip(list(range(df.shape[1])), df.columns.values))
+
+    print('当前数据类型为：')
+    for (k, v) in featypedict.items():
+        print(k.rjust(15), v[0])
+    print('字段名称对应数字为：')
+    for (n, m) in feadict.items():
+        print(n, m)
+    feaName = input('请输入如需要更改数据类型的字段对应的数字：')
+    if int(feaName) not in feadict.keys():
+        feaName = input('输入字段名称错误，请重新输入：')
+        if int(feaName) not in feadict.keys():
+            pass
+    feaName = feadict[int(feaName)]
+
+    type = input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：')
+    if int(type) not in typedict.keys():
+        type = input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：')
+        if int(type) not in typedict.keys():
+            pass
+    type = typedict[int(type)]
+
+    df[feaName] = df[feaName].astype(type)
+
+
+def dataSplit(data):
     '''
     数据分割
     :param data:带分割数据
     :param ratio: 分割比例
     :return: （数据集1，数据集2）
     '''
+    ratio = float(input('请输入数据分割比例：'))
     dataCount = data.shape[0]
     selectedCount = int(dataCount * ratio)
     if selectedCount > 0:
@@ -40,7 +67,11 @@ def dataSplit(data, ratio):
 if __name__ == '__main__':
     # path=input('Please input the file path: ')
     path = 'iris.csv'
-    dict, data = fileInfo(path)
-    t = dataSplit(data, 0.8)
-    print(t[0])
-    print(t[1])
+    feadict, data = fileInfo(path)
+
+    changeType(data, feadict)
+    print(data.dtypes)
+
+    t = dataSplit(data)
+    print(t[0].shape)
+    print(t[1].shape)

From d3d19ebd0e173e1f78782ca57b16819b143921f8 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 21 Jun 2018 10:44:12 +0800
Subject: [PATCH 12/49] add file data split

---
 ARUtil.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/ARUtil.py b/ARUtil.py
index 34ed408..c898e01 100755
--- a/ARUtil.py
+++ b/ARUtil.py
@@ -16,6 +16,20 @@ def __init__(self, threshold=0.05, dest_var='y'):
         self.logger = logging.getLogger("default")
         self.logger.setLevel(level=logging.INFO)
 
+    def info_value(self):
+        """
+        信息熵
+        :return:
+        """
+        pass
+
+    def chi_square(self):
+        """
+        卡方
+        :return:
+        """
+        pass
+
     def train_cal_input(self, excel_name='input.csv'):
         """
         AR值筛选

From 76e290ef3e43032efde8560dabd9c9645e4ade0e Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 21 Jun 2018 11:27:02 +0800
Subject: [PATCH 13/49] add file data split

---
 main.py | 84 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 44 insertions(+), 40 deletions(-)

diff --git a/main.py b/main.py
index 0c684a0..254cdde 100644
--- a/main.py
+++ b/main.py
@@ -5,73 +5,77 @@
 import numpy as np
 
 
-def fileInfo(path):
-    '''
+def file_info(file_path):
+    """
     获取文件信息
-    :param path: 文件路径
+    :param file_path: 文件路径
     :return: {字段名称：[字段类型，数据量，空值个数]}
-    '''
-    infodict = {}
-    data = pd.read_csv(path)
-    for c in data.columns:
-        ctype = data[c].dtype
-        nc = data[c].size - data[c].notnull().sum()
-        infodict[c] = [ctype, data[c].size, nc]  # 字段类型，数据量，空值个数
-    return infodict, data
+    """
+    info_dict = {}
+    raw_data = pd.read_csv(file_path)
+    for c in raw_data.columns:
+        c_type = raw_data[c].dtype
+        nc = raw_data[c].size - raw_data[c].notnull().sum()
+        info_dict[c] = [c_type, raw_data[c].size, nc]  # 字段类型，数据量，空值个数
+    return info_dict, data
 
 
-def changeType(df, featypedict):
-    typedict = {1: 'float64', 2: 'int64', 3: 'str'}
-    feadict = dict(zip(list(range(df.shape[1])), df.columns.values))
+def change_type(df, fea_type_dict):
+    """
+    改变数据类型
+    :param df:
+    :param fea_type_dict:
+    :return:
+    """
+    type_dict = {1: 'float64', 2: 'int64', 3: 'str'}
+    fea_dict = dict(zip(list(range(df.shape[1])), df.columns.values))
 
     print('当前数据类型为：')
-    for (k, v) in featypedict.items():
+    for (k, v) in fea_type_dict.items():
         print(k.rjust(15), v[0])
     print('字段名称对应数字为：')
     for (n, m) in feadict.items():
         print(n, m)
-    feaName = input('请输入如需要更改数据类型的字段对应的数字：')
-    if int(feaName) not in feadict.keys():
-        feaName = input('输入字段名称错误，请重新输入：')
-        if int(feaName) not in feadict.keys():
+    fea_name = int(input('请输入如需要更改数据类型的字段对应的数字：'))
+    if fea_name not in feadict.keys():
+        fea_name = int(input('输入字段名称错误，请重新输入：'))
+        if fea_name not in feadict.keys():
             pass
-    feaName = feadict[int(feaName)]
+    fea_name = fea_dict[fea_name]
 
-    type = input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：')
-    if int(type) not in typedict.keys():
-        type = input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：')
-        if int(type) not in typedict.keys():
+    target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：'))
+    if target_type not in type_dict.keys():
+        target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：'))
+        if target_type not in type_dict.keys():
             pass
-    type = typedict[int(type)]
+    type = type_dict[target_type]
+    df[fea_name] = df[fea_name].astype(type)
 
-    df[feaName] = df[feaName].astype(type)
 
-
-def dataSplit(data):
-    '''
+def data_split(data_to_split):
+    """
     数据分割
-    :param data:带分割数据
-    :param ratio: 分割比例
+    :param data_to_split:带分割数据
     :return: （数据集1，数据集2）
-    '''
+    """
     ratio = float(input('请输入数据分割比例：'))
-    dataCount = data.shape[0]
-    selectedCount = int(dataCount * ratio)
-    if selectedCount > 0:
-        splitedData = np.split(data.sample(frac=1), [selectedCount], axis=0)
+    data_count = data_to_split.shape[0]
+    selected_count = int(data_count * ratio)
+    if selected_count > 0:
+        split_data = np.split(data.sample(frac=1), [selected_count], axis=0)
     else:
         return 'Data is too less'
-    return splitedData
+    return split_data
 
 
 if __name__ == '__main__':
     # path=input('Please input the file path: ')
     path = 'iris.csv'
-    feadict, data = fileInfo(path)
+    feadict, data = file_info(path)
 
-    changeType(data, feadict)
+    change_type(data, feadict)
     print(data.dtypes)
 
-    t = dataSplit(data)
+    t = data_split(data)
     print(t[0].shape)
     print(t[1].shape)

From 1262418e60a50fcafd2c15104f1030a391522f70 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 21 Jun 2018 11:30:53 +0800
Subject: [PATCH 14/49] add file data split

---
 main.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/main.py b/main.py
index 254cdde..10f4322 100644
--- a/main.py
+++ b/main.py
@@ -28,31 +28,31 @@ def change_type(df, fea_type_dict):
     :return:
     """
     type_dict = {1: 'float64', 2: 'int64', 3: 'str'}
-    fea_dict = dict(zip(list(range(df.shape[1])), df.columns.values))
+    feature_dict = dict(zip(list(range(df.shape[1])), df.columns.values))
 
     print('当前数据类型为：')
     for (k, v) in fea_type_dict.items():
         print(k.rjust(15), v[0])
     print('字段名称对应数字为：')
-    for (n, m) in feadict.items():
+    for (n, m) in feature_dict.items():
         print(n, m)
     fea_name = int(input('请输入如需要更改数据类型的字段对应的数字：'))
-    if fea_name not in feadict.keys():
+    if fea_name not in feature_dict.keys():
         fea_name = int(input('输入字段名称错误，请重新输入：'))
-        if fea_name not in feadict.keys():
+        if fea_name not in fea_dict.keys():
             pass
-    fea_name = fea_dict[fea_name]
+    fea_name = feature_dict[fea_name]
 
     target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：'))
     if target_type not in type_dict.keys():
         target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：'))
         if target_type not in type_dict.keys():
             pass
-    type = type_dict[target_type]
-    df[fea_name] = df[fea_name].astype(type)
+    target_type = type_dict[target_type]
+    df[fea_name] = df[fea_name].astype(target_type)
 
 
-def data_split(data_to_split):
+def split_data(data_to_split):
     """
     数据分割
     :param data_to_split:带分割数据
@@ -71,11 +71,11 @@ def data_split(data_to_split):
 if __name__ == '__main__':
     # path=input('Please input the file path: ')
     path = 'iris.csv'
-    feadict, data = file_info(path)
+    fea_dict, data = file_info(path)
 
-    change_type(data, feadict)
+    change_type(data, fea_dict)
     print(data.dtypes)
 
-    t = data_split(data)
+    t = split_data(data)
     print(t[0].shape)
     print(t[1].shape)

From a52e2d2fb76cd6eaf444373e1cb2c177c4e0f1d7 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 21 Jun 2018 13:27:10 +0800
Subject: [PATCH 15/49] add file data split

---
 main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index 10f4322..f940fa7 100644
--- a/main.py
+++ b/main.py
@@ -17,7 +17,7 @@ def file_info(file_path):
         c_type = raw_data[c].dtype
         nc = raw_data[c].size - raw_data[c].notnull().sum()
         info_dict[c] = [c_type, raw_data[c].size, nc]  # 字段类型，数据量，空值个数
-    return info_dict, data
+    return info_dict, raw_data
 
 
 def change_type(df, fea_type_dict):
@@ -33,6 +33,7 @@ def change_type(df, fea_type_dict):
     print('当前数据类型为：')
     for (k, v) in fea_type_dict.items():
         print(k.rjust(15), v[0])
+
     print('字段名称对应数字为：')
     for (n, m) in feature_dict.items():
         print(n, m)
@@ -62,10 +63,10 @@ def split_data(data_to_split):
     data_count = data_to_split.shape[0]
     selected_count = int(data_count * ratio)
     if selected_count > 0:
-        split_data = np.split(data.sample(frac=1), [selected_count], axis=0)
+        splited_data = np.split(data.sample(frac=1), [selected_count], axis=0)
     else:
         return 'Data is too less'
-    return split_data
+    return splited_data
 
 
 if __name__ == '__main__':

From f57615a055076e51d57c78fca5694a0b8085c384 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 21 Jun 2018 13:45:39 +0800
Subject: [PATCH 16/49] add binning

---
 ARUtil.py  |  3 +--
 binning.py | 34 ++++++++++++++++++++++++++++++++++
 bins.py    |  1 -
 3 files changed, 35 insertions(+), 3 deletions(-)
 create mode 100644 binning.py
 delete mode 100644 bins.py

diff --git a/ARUtil.py b/ARUtil.py
index c898e01..d8e8df5 100755
--- a/ARUtil.py
+++ b/ARUtil.py
@@ -204,8 +204,7 @@ def main(self):
             is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict)
             if is_contain_empty_value:
                 self.logger.info("当前存在缺失值")
-                is_fill_empty = self.console_input(prompt="是否需要填充数据？1：是，其他值：否", if_value=["1"], else_value=[],
-                                                   if_rtn=True, else_rtn=False)
+                is_fill_empty = self.console_input(prompt="是否需要填充数据？1：是，其他值：否", if_value=["1"], else_value=[],if_rtn=True, else_rtn=False)
                 if is_fill_empty:
                     for col in empty_col_list:
                         fill_value = input("请输入列" + col + "待填充的数据：")
diff --git a/binning.py b/binning.py
new file mode 100644
index 0000000..7974076
--- /dev/null
+++ b/binning.py
@@ -0,0 +1,34 @@
+# -*- coding:utf-8 -*-
+__author__ = 'xujia'
+
+import pandas as pd
+
+
+def equal_distance_binning(data, fea_name):
+    """
+    等距分箱
+    :param data:
+    :param fea_name:
+    :return:
+    """
+    pass
+
+
+def equal_frequency_binning(data, fea_name):
+    """
+    等频分箱
+    :param data:
+    :param fea_name:
+    :return:
+    """
+    pass
+
+
+def auto_binning(data, fea_name):
+    """
+    自动分箱
+    :param data:
+    :param fea_name:
+    :return:
+    """
+    pass
diff --git a/bins.py b/bins.py
deleted file mode 100644
index 44d37d3..0000000
--- a/bins.py
+++ /dev/null
@@ -1 +0,0 @@
-# -*- coding:utf-8 -*-
\ No newline at end of file

From e379cb9d2b414e69d69ddbe2b9c7b2da775c968b Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 21 Jun 2018 14:03:08 +0800
Subject: [PATCH 17/49] add binning

---
 WOE.py     | 182 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 binning.py |  18 +++---
 2 files changed, 191 insertions(+), 9 deletions(-)
 create mode 100644 WOE.py

diff --git a/WOE.py b/WOE.py
new file mode 100644
index 0000000..5858e2b
--- /dev/null
+++ b/WOE.py
@@ -0,0 +1,182 @@
+# -*- coding:utf-8 -*-
+
+import pandas as pd
+from math import log
+import numpy as np
+import math
+from scipy import stats
+from sklearn.utils.multiclass import type_of_target
+
+
+class WOE:
+    def __init__(self):
+        self._WOE_MIN = -20
+        self._WOE_MAX = 20
+
+    def woe(self, X, y, event=1):
+        '''
+        Calculate woe of each feature category and information value
+        :param X: 2-D numpy array explanatory features which should be discreted already
+        :param y: 1-D numpy array target variable which should be binary
+        :param event: value of binary stands for the event to predict
+        :return: numpy array of woe dictionaries, each dictionary contains woe values for categories of each feature
+                 numpy array of information value of each feature
+        '''
+        self.check_target_binary(y)
+        X1 = self.feature_discretion(X)
+
+        res_woe = []
+        res_iv = []
+        for i in range(0, X1.shape[-1]):
+            x = X1[:, i]
+            woe_dict, iv1 = self.woe_single_x(x, y, event)
+            res_woe.append(woe_dict)
+            res_iv.append(iv1)
+        return np.array(res_woe), np.array(res_iv)
+
+    def woe_single_x(self, x, y, event=1):
+        """
+        calculate woe and information for a single feature
+        :param x: 1-D numpy starnds for single feature
+        :param y: 1-D numpy array target variable
+        :param event: value of binary stands for the event to predict
+        :return: dictionary contains woe values for categories of this feature information value of this feature
+        """
+        self.check_target_binary(y)
+
+        event_total, non_event_total = self.count_binary(y, event=event)
+        x_labels = np.unique(x)
+        woe_dict = {}
+        iv = 0
+        for x1 in x_labels:
+            y1 = y[np.where(x == x1)[0]]
+            event_count, non_event_count = self.count_binary(y1, event=event)
+            rate_event = 1.0 * event_count / event_total
+            rate_non_event = 1.0 * non_event_count / non_event_total
+            if rate_event == 0:
+                woe1 = self._WOE_MIN
+            elif rate_non_event == 0:
+                woe1 = self._WOE_MAX
+            else:
+                woe1 = math.log(rate_event / rate_non_event)
+            woe_dict[x1] = woe1
+            iv += (rate_event - rate_non_event) * woe1
+        return woe_dict, iv
+
+    def woe_replace(self, X, woe_arr):
+        """
+        replace the explanatory feature categories with its woe value
+        :param X: 2-D numpy array explanatory features which should be discreted already
+        :param woe_arr: numpy array of woe dictionaries, each dictionary contains woe values for categories of each feature
+        :return: the new numpy array in which woe values filled
+        """
+        if X.shape[-1] != woe_arr.shape[-1]:
+            raise ValueError('WOE dict array length must be equal with features length')
+
+        res = np.copy(X).astype(float)
+        idx = 0
+        for woe_dict in woe_arr:
+            for k in woe_dict.keys():
+                woe = woe_dict[k]
+                res[:, idx][np.where(res[:, idx] == k)[0]] = woe * 1.0
+            idx += 1
+
+        return res
+
+    def combined_iv(self, X, y, masks, event=1):
+        """
+        calcute the information vlaue of combination features
+        :param X: 2-D numpy array explanatory features which should be discreted already
+        :param y: 1-D numpy array target variable
+        :param masks: 1-D numpy array of masks stands for which features are included in combination,
+                      e.g. np.array([0,0,1,1,1,0,0,0,0,0,1]), the length should be same as features length
+        :param event: value of binary stands for the event to predict
+        :return: woe dictionary and information value of combined features
+        """
+        if masks.shape[-1] != X.shape[-1]:
+            raise ValueError('Masks array length must be equal with features length')
+
+        x = X[:, np.where(masks == 1)[0]]
+        tmp = []
+        for i in range(x.shape[0]):
+            tmp.append(self.combine(x[i, :]))
+
+        dumy = np.array(tmp)
+        # dumy_labels = np.unique(dumy)
+        woe, iv = self.woe_single_x(dumy, y, event)
+        return woe, iv
+
+    def combine(self, list):
+        res = ''
+        for item in list:
+            res += str(item)
+        return res
+
+    def count_binary(self, a, event=1):
+        event_count = (a == event).sum()
+        non_event_count = a.shape[-1] - event_count
+        return event_count, non_event_count
+
+    def check_target_binary(self, y):
+        """
+        check if the target variable is binary, raise error if not.
+        :param y:
+        :return:
+        """
+        y_type = type_of_target(y)
+        if y_type not in ['binary']:
+            raise ValueError('Label type must be binary')
+
+    def feature_discretion(self, X):
+        """
+        Discrete the continuous features of input data X, and keep other features unchanged.
+        :param X : numpy array
+        :return: the numpy array in which all continuous features are discreted
+        """
+        temp = []
+        for i in range(0, X.shape[-1]):
+            x = X[:, i]
+            x_type = type_of_target(x)
+            if x_type == 'continuous':
+                x1 = self.discrete(x)
+                temp.append(x1)
+            else:
+                temp.append(x)
+        return np.array(temp).T
+
+    def discrete(self, x):
+        """
+        Discrete the input 1-D numpy array using 5 equal percentiles
+        :param x: 1-D numpy array
+        :return: discreted 1-D numpy array
+        """
+        res = np.array([0] * x.shape[-1], dtype=int)
+        for i in range(5):
+            point1 = stats.scoreatpercentile(x, i * 20)
+            point2 = stats.scoreatpercentile(x, (i + 1) * 20)
+            x1 = x[np.where((x >= point1) & (x <= point2))]
+            mask = np.in1d(x, x1)
+            res[mask] = (i + 1)
+        return res
+
+    def woe_feature(self,x,dict):
+        new_x = []
+        for i in x:
+            new_x.append(dict[i])
+        return new_x
+
+    @property
+    def WOE_MIN(self):
+        return self._WOE_MIN
+
+    @WOE_MIN.setter
+    def WOE_MIN(self, woe_min):
+        self._WOE_MIN = woe_min
+
+    @property
+    def WOE_MAX(self):
+        return self._WOE_MAX
+
+    @WOE_MAX.setter
+    def WOE_MAX(self, woe_max):
+        self._WOE_MAX = woe_max
\ No newline at end of file
diff --git a/binning.py b/binning.py
index 7974076..c6a5d86 100644
--- a/binning.py
+++ b/binning.py
@@ -4,31 +4,31 @@
 import pandas as pd
 
 
-def equal_distance_binning(data, fea_name):
+def equal_distance_binning(df, fea_name):
     """
     等距分箱
-    :param data:
+    :param df:
     :param fea_name:
     :return:
     """
-    pass
+    df[fea_name + '_d'] = pd.cut(df[fea_name])
 
 
-def equal_frequency_binning(data, fea_name):
+def equal_frequency_binning(df, fea_name):
     """
     等频分箱
-    :param data:
+    :param df:
     :param fea_name:
     :return:
     """
-    pass
+    df[fea_name + '_f'] = pd.cut(df[fea_name])
 
 
-def auto_binning(data, fea_name):
+def auto_binning(df, fea_name):
     """
     自动分箱
-    :param data:
+    :param df:
     :param fea_name:
     :return:
     """
-    pass
+    df[fea_name + '_a'] = pd.cut(df[fea_name])

From fa55d5524a8ff1f990a853e8698e0bc421fc4236 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 21 Jun 2018 14:08:12 +0800
Subject: [PATCH 18/49] add WOE

---
 WOE.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/WOE.py b/WOE.py
index 5858e2b..d76a88a 100644
--- a/WOE.py
+++ b/WOE.py
@@ -85,7 +85,7 @@ def woe_replace(self, X, woe_arr):
 
     def combined_iv(self, X, y, masks, event=1):
         """
-        calcute the information vlaue of combination features
+        calcute the information value of combination features
         :param X: 2-D numpy array explanatory features which should be discreted already
         :param y: 1-D numpy array target variable
         :param masks: 1-D numpy array of masks stands for which features are included in combination,

From 63c5e0eeebddb7348481e5162b71a91cf50abd3f Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 22 Jun 2018 08:48:12 +0800
Subject: [PATCH 19/49] add WOE

---
 WOE.py => woe.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)
 rename WOE.py => woe.py (95%)

diff --git a/WOE.py b/woe.py
similarity index 95%
rename from WOE.py
rename to woe.py
index d76a88a..08e086b 100644
--- a/WOE.py
+++ b/woe.py
@@ -80,7 +80,6 @@ def woe_replace(self, X, woe_arr):
                 woe = woe_dict[k]
                 res[:, idx][np.where(res[:, idx] == k)[0]] = woe * 1.0
             idx += 1
-
         return res
 
     def combined_iv(self, X, y, masks, event=1):
@@ -179,4 +178,14 @@ def WOE_MAX(self):
 
     @WOE_MAX.setter
     def WOE_MAX(self, woe_max):
-        self._WOE_MAX = woe_max
\ No newline at end of file
+        self._WOE_MAX = woe_max
+
+if __name__ == '__main__':
+    # path=input('Please input the file path: ')
+    path = 'iris.csv'
+    raw_data = pd.read_csv(path)
+    # print(raw_data)
+    woe=WOE()
+    # woe_result=woe.woe_single_x(x=raw_data,'SepalLength')
+    ret=pd.cut(raw_data['SepalLength'],5)
+    print(ret)
\ No newline at end of file

From 6c57c8282c938461c9a902c4399242244e6c6f3a Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 22 Jun 2018 13:25:44 +0800
Subject: [PATCH 20/49] add WOE

---
 binning.py | 29 +++++++++++++++++++++++------
 main.py    | 15 +++++++++------
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/binning.py b/binning.py
index c6a5d86..593e16f 100644
--- a/binning.py
+++ b/binning.py
@@ -2,33 +2,50 @@
 __author__ = 'xujia'
 
 import pandas as pd
+import numpy as np
+from scipy import stats
 
 
-def equal_distance_binning(df, fea_name):
+def equal_distance_binning(df, fea_name, bin_count):
     """
     等距分箱
     :param df:
     :param fea_name:
+    :param bin_count
     :return:
     """
-    df[fea_name + '_d'] = pd.cut(df[fea_name])
+    df[fea_name + '_d'] = pd.cut(df[fea_name], bin_count)
 
 
-def equal_frequency_binning(df, fea_name):
+def equal_frequency_binning(df, fea_name, bin_count):
     """
     等频分箱
     :param df:
     :param fea_name:
+    :param bin_count
     :return:
     """
-    df[fea_name + '_f'] = pd.cut(df[fea_name])
+    df[fea_name + '_f'] = pd.cut(df[fea_name], bin_count)
 
 
-def auto_binning(df, fea_name):
+def auto_binning(df, target_name, feature_name, max_bin_count):
     """
     自动分箱
     :param df:
     :param fea_name:
+    :param bin_count
     :return:
     """
-    df[fea_name + '_a'] = pd.cut(df[fea_name])
+    r = 0
+    good = df[target_name].sum()
+    bad = df[target_name].count() - good
+    while np.abs(r) < 1:
+        d1 = pd.DataFrame({'X': df[feature_name], 'Y': df[target_name],
+                           'Bucket': pd.qcut(df[feature_name], max_bin_count, duplicates='drop')})
+        d2 = d1.groupby('Bucket', as_index=True)
+        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
+        max_bin_count = max_bin_count - 1
+    woe = np.log((d2.mean().Y / (1 - d2.mean().Y)) / (good / bad))
+    woe_dict = woe.to_dict()
+    woe_values = sorted(list(woe_dict.values()))
+    df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]).replace(np.inf, woe_values[-2])
diff --git a/main.py b/main.py
index f940fa7..81a8cad 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,7 @@
 
 import pandas as pd
 import numpy as np
+import binning
 
 
 def file_info(file_path):
@@ -73,10 +74,12 @@ def split_data(data_to_split):
     # path=input('Please input the file path: ')
     path = 'iris.csv'
     fea_dict, data = file_info(path)
+    data = data.fillna(0.0)
+    # change_type(data, fea_dict)
+    # print(data.dtypes)
 
-    change_type(data, fea_dict)
-    print(data.dtypes)
-
-    t = split_data(data)
-    print(t[0].shape)
-    print(t[1].shape)
+    # t = split_data(data)
+    # print(t[0].shape)
+    # print(t[1].shape)
+    binning.auto_binning(data, 'Label', 'SepalLength', 10)
+    print(data)

From eda25554ae4373c8cd59a3556d040a28790db4b6 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 22 Jun 2018 14:28:21 +0800
Subject: [PATCH 21/49] add WOE

---
 ARUtil.py   | 14 +++++---------
 binning.py  |  5 ++++-
 main.py     |  4 ++++
 modeling.py | 21 +++++++++++++++++++++
 4 files changed, 34 insertions(+), 10 deletions(-)
 create mode 100644 modeling.py

diff --git a/ARUtil.py b/ARUtil.py
index d8e8df5..1cf4a9c 100755
--- a/ARUtil.py
+++ b/ARUtil.py
@@ -2,13 +2,10 @@
 import pandas as pd
 import numpy as np
 import logging
-# import sys
-# reload(sys)
-# sys.setdefaultencoding('utf8')
+from scipy import stats
 
 
 class ARFilter(object):
-
     def __init__(self, threshold=0.05, dest_var='y'):
         self.threshold = threshold
         self.dest_var = dest_var
@@ -96,7 +93,7 @@ def cal_ks(self, data):
                 bad_count += 1.0
             else:
                 good_count += 1.0
-            val = abs(bad_count/total_bad_count - good_count/total_good_count)
+            val = abs(bad_count / total_bad_count - good_count / total_good_count)
             max_ks = max(max_ks, val)
         return max_ks
 
@@ -154,7 +151,7 @@ def del_empty_value(self, data, empty_rate_threshold=0.5):
         for col in data.columns.values:
             if col == 'y':
                 continue
-            empty_ratio = (data[col].shape[0] - data[col].count())/data[col].shape[0]
+            empty_ratio = (data[col].shape[0] - data[col].count()) / data[col].shape[0]
             if empty_ratio >= empty_rate_threshold:
                 self.logger.info("变量：" + col + "缺失率为" + str(empty_ratio) + ",高于阈值：" + str(empty_rate_threshold))
                 data = data.drop(col, axis=1)
@@ -204,7 +201,8 @@ def main(self):
             is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict)
             if is_contain_empty_value:
                 self.logger.info("当前存在缺失值")
-                is_fill_empty = self.console_input(prompt="是否需要填充数据？1：是，其他值：否", if_value=["1"], else_value=[],if_rtn=True, else_rtn=False)
+                is_fill_empty = self.console_input(prompt="是否需要填充数据？1：是，其他值：否", if_value=["1"], else_value=[],
+                                                   if_rtn=True, else_rtn=False)
                 if is_fill_empty:
                     for col in empty_col_list:
                         fill_value = input("请输入列" + col + "待填充的数据：")
@@ -228,6 +226,4 @@ def run():
 
 
 if __name__ == "__main__":
-
     run()
-
diff --git a/binning.py b/binning.py
index 593e16f..f0c4675 100644
--- a/binning.py
+++ b/binning.py
@@ -48,4 +48,7 @@ def auto_binning(df, target_name, feature_name, max_bin_count):
     woe = np.log((d2.mean().Y / (1 - d2.mean().Y)) / (good / bad))
     woe_dict = woe.to_dict()
     woe_values = sorted(list(woe_dict.values()))
-    df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]).replace(np.inf, woe_values[-2])
+    print(woe_values)
+    # 如果存在woe为inf情况，将其替换为不为inf的最大值加一
+    df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]).replace(np.inf, woe_values[-2] + 1).replace(
+        -np.inf, woe_values[1] - 1)
diff --git a/main.py b/main.py
index 81a8cad..7097255 100644
--- a/main.py
+++ b/main.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import numpy as np
 import binning
+import modeling
 
 
 def file_info(file_path):
@@ -82,4 +83,7 @@ def split_data(data_to_split):
     # print(t[0].shape)
     # print(t[1].shape)
     binning.auto_binning(data, 'Label', 'SepalLength', 10)
+    binning.auto_binning(data, 'Label', 'PetalLength', 10)
+    binning.auto_binning(data, 'Label', 'PetalWidth', 10)
     print(data)
+    print(modeling.model(data,['SepalLength_woe','PetalLength_woe','PetalWidth_woe'],'Label'))
diff --git a/modeling.py b/modeling.py
new file mode 100644
index 0000000..a185bf1
--- /dev/null
+++ b/modeling.py
@@ -0,0 +1,21 @@
+# -*- coding:utf-8 -*-
+__author__ = 'xujia'
+import numpy as np
+
+from sklearn.linear_model import LogisticRegression
+
+
+def model(data, fea_list, target):
+    cls = LogisticRegression()
+    cls.fit(data[fea_list], data[target])
+    print(cls.coef_)
+    print(cls.intercept_)
+    return cls
+
+
+def score_trans(data, coef, intercept, scaled_value, odds, pdo):
+    a = (np.log(2 * odds) - np.log(odds)) / pdo
+    b = np.log(odds, np.e) - a * scaled_value
+    p = intercept + coef.dot(data)
+    score = np.log(p / (1 - p)) * a + b
+    return score

From ebcc7379d020fbd28fa8da9fc37e931825e6f26c Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 22 Jun 2018 16:23:09 +0800
Subject: [PATCH 22/49] add WOE

---
 main.py     | 10 ++++++----
 modeling.py | 13 ++++++-------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/main.py b/main.py
index 7097255..ea946bf 100644
--- a/main.py
+++ b/main.py
@@ -61,7 +61,8 @@ def split_data(data_to_split):
     :param data_to_split:带分割数据
     :return: （数据集1，数据集2）
     """
-    ratio = float(input('请输入数据分割比例：'))
+    # ratio = float(input('请输入数据分割比例：'))
+    ratio = 0.8
     data_count = data_to_split.shape[0]
     selected_count = int(data_count * ratio)
     if selected_count > 0:
@@ -79,11 +80,12 @@ def split_data(data_to_split):
     # change_type(data, fea_dict)
     # print(data.dtypes)
 
-    # t = split_data(data)
     # print(t[0].shape)
     # print(t[1].shape)
     binning.auto_binning(data, 'Label', 'SepalLength', 10)
     binning.auto_binning(data, 'Label', 'PetalLength', 10)
     binning.auto_binning(data, 'Label', 'PetalWidth', 10)
-    print(data)
-    print(modeling.model(data,['SepalLength_woe','PetalLength_woe','PetalWidth_woe'],'Label'))
+    data1, data2 = split_data(data)
+    model = modeling.model(data1, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
+    predict_score = modeling.score_trans(data2[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5,100, 10)
+    print(list(zip(data2['Label'].values, predict_score)))
diff --git a/modeling.py b/modeling.py
index a185bf1..59434e4 100644
--- a/modeling.py
+++ b/modeling.py
@@ -8,14 +8,13 @@
 def model(data, fea_list, target):
     cls = LogisticRegression()
     cls.fit(data[fea_list], data[target])
-    print(cls.coef_)
-    print(cls.intercept_)
     return cls
 
 
-def score_trans(data, coef, intercept, scaled_value, odds, pdo):
-    a = (np.log(2 * odds) - np.log(odds)) / pdo
-    b = np.log(odds, np.e) - a * scaled_value
-    p = intercept + coef.dot(data)
-    score = np.log(p / (1 - p)) * a + b
+def score_trans(data, model, p, scaled_value, pdo):
+    b = pdo / np.log(2)
+    a = scaled_value + b * np.log(p)
+    p = model.predict_proba(data)[:, 1]
+    score = a - np.log(p / (1 - p)) * b
+
     return score

From 10295547c13567d58986ddc24595d8fd581eeb77 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 22 Jun 2018 17:30:01 +0800
Subject: [PATCH 23/49] add WOE

---
 binning.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/binning.py b/binning.py
index f0c4675..4c6144c 100644
--- a/binning.py
+++ b/binning.py
@@ -32,15 +32,17 @@ def auto_binning(df, target_name, feature_name, max_bin_count):
     """
     自动分箱
     :param df:
-    :param fea_name:
-    :param bin_count
+    :param target_name: 目标变量名
+    :param feature_name:特征变量名称
+    :param max_bin_count:最大分箱数
     :return:
     """
     r = 0
     good = df[target_name].sum()
     bad = df[target_name].count() - good
     while np.abs(r) < 1:
-        d1 = pd.DataFrame({'X': df[feature_name], 'Y': df[target_name],
+        d1 = pd.DataFrame({'X': df[feature_name],
+                           'Y': df[target_name],
                            'Bucket': pd.qcut(df[feature_name], max_bin_count, duplicates='drop')})
         d2 = d1.groupby('Bucket', as_index=True)
         r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
@@ -50,5 +52,7 @@ def auto_binning(df, target_name, feature_name, max_bin_count):
     woe_values = sorted(list(woe_dict.values()))
     print(woe_values)
     # 如果存在woe为inf情况，将其替换为不为inf的最大值加一
-    df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]).replace(np.inf, woe_values[-2] + 1).replace(
-        -np.inf, woe_values[1] - 1)
+    df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x])\
+                                            .replace(np.inf, woe_values[-2] + 1)\
+                                            .replace(-np.inf, woe_values[1] - 1)
+    # return woe_dict

From 08f54d5a2f79ce770345abefcde6ae93c55c3dfe Mon Sep 17 00:00:00 2001
From: GiantTao <wtctc@126.com>
Date: Mon, 25 Jun 2018 14:02:30 +0800
Subject: [PATCH 24/49] ROC

---
 evaluate.py | 21 +++++++++++++++++++++
 main.py     | 13 +++++++++----
 2 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 evaluate.py

diff --git a/evaluate.py b/evaluate.py
new file mode 100644
index 0000000..a00afe6
--- /dev/null
+++ b/evaluate.py
@@ -0,0 +1,21 @@
+from sklearn  import metrics
+import matplotlib.pyplot as plt
+
+
+def auc(model, test_data, fea_list, target):
+    predict_value = model.predict_proba(test_data[fea_list])[:, 1]
+    return metrics.roc_auc_score(test_data[target], predict_value)
+
+
+def roc(model, test_data, fea_list, target):
+    predict_value = model.predict_proba(test_data[fea_list])[:, 1]
+    fpr, tpr, thresholds = metrics.roc_curve(test_data[target], predict_value)
+    roc_auc = metrics.auc(fpr, tpr)
+    plt.figure()
+    plt.plot(fpr, tpr, label='data1, AUC = %0.2f' % roc_auc)
+    plt.legend(loc=4)
+    plt.xlabel("False Positive Rate")
+    plt.ylabel("True Positive Rate")
+    plt.title("ROC Diagram")
+    plt.show()
+
diff --git a/main.py b/main.py
index ea946bf..ae34431 100644
--- a/main.py
+++ b/main.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import numpy as np
 import binning
+import evaluate
 import modeling
 
 
@@ -85,7 +86,11 @@ def split_data(data_to_split):
     binning.auto_binning(data, 'Label', 'SepalLength', 10)
     binning.auto_binning(data, 'Label', 'PetalLength', 10)
     binning.auto_binning(data, 'Label', 'PetalWidth', 10)
-    data1, data2 = split_data(data)
-    model = modeling.model(data1, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
-    predict_score = modeling.score_trans(data2[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5,100, 10)
-    print(list(zip(data2['Label'].values, predict_score)))
+    train_data, test_data = split_data(data)
+    model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
+    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10)
+    print(list(zip(test_data['Label'].values, predict_score)))
+
+    auc = evaluate.auc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
+    print("auc: " + str(auc))
+    evaluate.roc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')

From 638334c4c99572d2e33412c641841dda4727b84c Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Mon, 25 Jun 2018 14:08:02 +0800
Subject: [PATCH 25/49] Merge branch 'master' of
 /Users/yuguanghui/Documents/GitHub/ScoreCard with conflicts.

---
 main.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/main.py b/main.py
index ae34431..9a6e356 100644
--- a/main.py
+++ b/main.py
@@ -4,7 +4,6 @@
 import pandas as pd
 import numpy as np
 import binning
-import evaluate
 import modeling
 
 
@@ -86,11 +85,7 @@ def split_data(data_to_split):
     binning.auto_binning(data, 'Label', 'SepalLength', 10)
     binning.auto_binning(data, 'Label', 'PetalLength', 10)
     binning.auto_binning(data, 'Label', 'PetalWidth', 10)
-    train_data, test_data = split_data(data)
-    model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
-    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10)
-    print(list(zip(test_data['Label'].values, predict_score)))
-
-    auc = evaluate.auc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
-    print("auc: " + str(auc))
-    evaluate.roc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
+    data1, data2 = split_data(data)
+    model = modeling.model(data1, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
+    predict_score = modeling.score_trans(data2[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10)
+    print(list(zip(data2['Label'].values, predict_score)))

From eaf928ede03656e0001225a3d7a12d53dc1d4a28 Mon Sep 17 00:00:00 2001
From: GiantTao <wtctc@126.com>
Date: Mon, 25 Jun 2018 14:11:30 +0800
Subject: [PATCH 26/49] ROC

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index ae34431..98dd6b7 100644
--- a/main.py
+++ b/main.py
@@ -92,5 +92,5 @@ def split_data(data_to_split):
     print(list(zip(test_data['Label'].values, predict_score)))
 
     auc = evaluate.auc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
-    print("auc: " + str(auc))
+    print("au值: " + str(auc))
     evaluate.roc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')

From 59de5e796c1f8d41741bfa262f4ccb4015951d0b Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Mon, 25 Jun 2018 14:49:58 +0800
Subject: [PATCH 27/49] add WOE

---
 binning.py |  59 ++++++++++++++--
 iris.csv   | 200 ++++++++++++++++++++++++++---------------------------
 2 files changed, 155 insertions(+), 104 deletions(-)

diff --git a/binning.py b/binning.py
index 4c6144c..c38936c 100644
--- a/binning.py
+++ b/binning.py
@@ -52,7 +52,58 @@ def auto_binning(df, target_name, feature_name, max_bin_count):
     woe_values = sorted(list(woe_dict.values()))
     print(woe_values)
     # 如果存在woe为inf情况，将其替换为不为inf的最大值加一
-    df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x])\
-                                            .replace(np.inf, woe_values[-2] + 1)\
-                                            .replace(-np.inf, woe_values[1] - 1)
-    # return woe_dict
+    df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]) \
+        .replace(np.inf, woe_values[-2] + 1) \
+        .replace(-np.inf, woe_values[1] - 1)
+
+
+def chi2(A):
+    ''' Compute the Chi-Square value '''
+    m, k = A.shape  # 行数 列数
+
+    R = A.sum(axis=1)  # 行求和结果
+    C = A.sum(axis=0)  # 列求和结果
+    N = A.sum()  # 总和
+
+    res = 0
+    for i in range(m):
+        for j in range(k):
+            Eij = 1.0 * R[i] * C[j] / N
+            if Eij != 0:
+                res = 1.0 * res + (A[i][j] - Eij) ** 2 / Eij
+    return res
+
+
+def chi_merge(df, fea_name, target_name, dis_count):
+    fea_count = df[[fea_name, target_name]].copy().groupby([fea_name, target_name]).size().unstack().fillna(0.0)
+    while fea_count.shape[0] > dis_count:
+        chi_list = []
+        for i in range(fea_count.shape[0] - 1):
+            chi_value = chi2(fea_count.iloc[i:i + 2].values)
+            chi_list.append([fea_count.index[i], chi_value])
+
+        chi_min_index = np.argmin(np.array(chi_list)[:, 1])
+        if chi_min_index == len(chi_list) - 1:
+            current_fea = chi_list[chi_min_index][0]
+            fea_count.loc[current_fea] = fea_count.loc[current_fea:].sum(axis=0)
+            fea_count = fea_count.loc[:current_fea].copy()
+        else:
+            current_fea = chi_list[chi_min_index][0]
+            next_fea = chi_list[chi_min_index + 1][0]
+            fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea]
+            fea_count.drop([next_fea], inplace=True)
+            chi_list.remove(chi_list[chi_min_index + 1])
+    print(fea_count)
+
+
+def discrete(path):
+    df = pd.read_csv(path)
+    target_name = df.columns[-1]
+    fea_names = df.columns[0:-1]
+    dis_count = 2
+    for f in fea_names:
+        chi_merge(df, f, target_name, dis_count)
+
+
+if __name__ == '__main__':
+    discrete('iris.csv')
diff --git a/iris.csv b/iris.csv
index a320fb3..517507e 100644
--- a/iris.csv
+++ b/iris.csv
@@ -1,101 +1,101 @@
 SepalLength,SepalWidth,PetalLength,PetalWidth,Label
-5.1,3.5,,0.2,0
-4.9,3.0,,0.2,0
-4.7,3.2,,0.2,0
-4.6,3.1,,0.2,0
-5.0,3.6,,0.2,0
-5.4,3.9,,0.4,0
-4.6,3.4,,0.3,0
-5.0,3.4,1.5,0.2,0
-4.4,2.9,1.4,0.2,0
-4.9,3.1,1.5,0.1,0
-5.4,3.7,1.5,0.2,0
-4.8,3.4,1.6,0.2,0
-4.8,3.0,1.4,0.1,0
-4.3,,1.1,0.1,0
-5.8,4.0,1.2,0.2,0
-5.7,4.4,1.5,0.4,0
-5.4,3.9,1.3,0.4,0
-5.1,3.5,1.4,0.3,0
-5.7,3.8,1.7,0.3,0
-5.1,3.8,1.5,0.3,0
-5.4,3.4,1.7,0.2,0
-5.1,3.7,1.5,0.4,0
-4.6,3.6,1.0,0.2,0
-5.1,3.3,1.7,0.5,0
-4.8,3.4,1.9,0.2,0
-5.0,3.0,1.6,0.2,0
-5.0,3.4,1.6,0.4,0
-5.2,3.5,1.5,0.2,0
-5.2,3.4,1.4,0.2,0
-4.7,3.2,1.6,0.2,0
-4.8,3.1,1.6,0.2,0
-5.4,3.4,1.5,0.4,0
-5.2,4.1,1.5,0.1,0
-5.5,4.2,1.4,0.2,0
-4.9,3.1,1.5,0.1,0
-5.0,3.2,1.2,0.2,0
-5.5,3.5,1.3,0.2,0
-4.9,3.1,1.5,0.1,0
-4.4,3.0,1.3,0.2,0
-5.1,3.4,1.5,0.2,0
-5.0,3.5,1.3,0.3,0
-4.5,2.3,1.3,0.3,0
-4.4,3.2,1.3,0.2,0
-5.0,3.5,1.6,0.6,0
-5.1,3.8,1.9,0.4,0
-4.8,3.0,1.4,0.3,0
-5.1,3.8,1.6,0.2,0
-4.6,3.2,1.4,0.2,0
-5.3,3.7,1.5,0.2,0
-5.0,3.3,1.4,0.2,0
-7.0,3.2,4.7,1.4,1
-6.4,3.2,4.5,1.5,1
-6.9,3.1,4.9,1.5,1
-5.5,2.3,4.0,1.3,1
-6.5,2.8,4.6,1.5,1
-5.7,2.8,4.5,1.3,1
-6.3,3.3,4.7,1.6,1
-4.9,2.4,3.3,1.0,1
-6.6,2.9,4.6,1.3,1
-5.2,2.7,3.9,1.4,1
-5.0,2.0,3.5,1.0,1
-5.9,3.0,4.2,1.5,1
-6.0,2.2,4.0,1.0,1
-6.1,2.9,4.7,1.4,1
-5.6,2.9,3.6,1.3,1
-6.7,3.1,4.4,1.4,1
-5.6,3.0,4.5,1.5,1
-5.8,2.7,4.1,1.0,1
-6.2,2.2,4.5,1.5,1
-5.6,2.5,3.9,1.1,1
-5.9,3.2,4.8,1.8,1
-6.1,2.8,4.0,1.3,1
-6.3,2.5,4.9,1.5,1
-6.1,2.8,4.7,1.2,1
-6.4,2.9,4.3,1.3,1
-6.6,3.0,4.4,1.4,1
-6.8,2.8,4.8,1.4,1
-6.7,3.0,5.0,1.7,1
-6.0,2.9,4.5,1.5,1
-5.7,2.6,3.5,1.0,1
-5.5,2.4,3.8,1.1,1
-5.5,2.4,3.7,1.0,1
-5.8,2.7,3.9,1.2,1
-6.0,2.7,5.1,1.6,1
-5.4,3.0,4.5,1.5,1
-6.0,3.4,4.5,1.6,1
-6.7,3.1,4.7,1.5,1
-6.3,2.3,4.4,1.3,1
-5.6,3.0,4.1,1.3,1
-5.5,2.5,4.0,1.3,1
-5.5,2.6,4.4,1.2,1
-6.1,3.0,4.6,1.4,1
-5.8,2.6,4.0,1.2,1
-5.0,2.3,3.3,1.0,1
-5.6,2.7,4.2,1.3,1
-5.7,3.0,4.2,1.2,1
-5.7,2.9,4.2,1.3,1
-6.2,2.9,4.3,1.3,1
-5.1,2.5,3.0,1.1,1
-5.7,2.8,4.1,1.3,1
\ No newline at end of file
+5.1,3.5,1.4,0.2,0.0
+4.9,3.0,1.4,0.2,0.0
+4.7,3.2,1.3,0.2,0.0
+4.6,3.1,1.5,0.2,0.0
+5.0,3.6,1.4,0.2,0.0
+5.4,3.9,1.7,0.4,0.0
+4.6,3.4,1.4,0.3,0.0
+5.0,3.4,1.5,0.2,0.0
+4.4,2.9,1.4,0.2,0.0
+4.9,3.1,1.5,0.1,0.0
+5.4,3.7,1.5,0.2,0.0
+4.8,3.4,1.6,0.2,0.0
+4.8,3.0,1.4,0.1,0.0
+4.3,3.0,1.1,0.1,0.0
+5.8,4.0,1.2,0.2,0.0
+5.7,4.4,1.5,0.4,0.0
+5.4,3.9,1.3,0.4,0.0
+5.1,3.5,1.4,0.3,0.0
+5.7,3.8,1.7,0.3,0.0
+5.1,3.8,1.5,0.3,0.0
+5.4,3.4,1.7,0.2,0.0
+5.1,3.7,1.5,0.4,0.0
+4.6,3.6,1.0,0.2,0.0
+5.1,3.3,1.7,0.5,0.0
+4.8,3.4,1.9,0.2,0.0
+5.0,3.0,1.6,0.2,0.0
+5.0,3.4,1.6,0.4,0.0
+5.2,3.5,1.5,0.2,0.0
+5.2,3.4,1.4,0.2,0.0
+4.7,3.2,1.6,0.2,0.0
+4.8,3.1,1.6,0.2,0.0
+5.4,3.4,1.5,0.4,0.0
+5.2,4.1,1.5,0.1,0.0
+5.5,4.2,1.4,0.2,0.0
+4.9,3.1,1.5,0.1,0.0
+5.0,3.2,1.2,0.2,0.0
+5.5,3.5,1.3,0.2,0.0
+4.9,3.1,1.5,0.1,0.0
+4.4,3.0,1.3,0.2,0.0
+5.1,3.4,1.5,0.2,0.0
+5.0,3.5,1.3,0.3,0.0
+4.5,2.3,1.3,0.3,0.0
+4.4,3.2,1.3,0.2,0.0
+5.0,3.5,1.6,0.6,0.0
+5.1,3.8,1.9,0.4,0.0
+4.8,3.0,1.4,0.3,0.0
+5.1,3.8,1.6,0.2,0.0
+4.6,3.2,1.4,0.2,0.0
+5.3,3.7,1.5,0.2,0.0
+5.0,3.3,1.4,0.2,0.0
+7.0,3.2,4.7,1.4,1.0
+6.4,3.2,4.5,1.5,1.0
+6.9,3.1,4.9,1.5,1.0
+5.5,2.3,4.0,1.3,1.0
+6.5,2.8,4.6,1.5,1.0
+5.7,2.8,4.5,1.3,1.0
+6.3,3.3,4.7,1.6,1.0
+4.9,2.4,3.3,1.0,1.0
+6.6,2.9,4.6,1.3,1.0
+5.2,2.7,3.9,1.4,1.0
+5.0,2.0,3.5,1.0,1.0
+5.9,3.0,4.2,1.5,1.0
+6.0,2.2,4.0,1.0,1.0
+6.1,2.9,4.7,1.4,1.0
+5.6,2.9,3.6,1.3,1.0
+6.7,3.1,4.4,1.4,1.0
+5.6,3.0,4.5,1.5,1.0
+5.8,2.7,4.1,1.0,1.0
+6.2,2.2,4.5,1.5,1.0
+5.6,2.5,3.9,1.1,1.0
+5.9,3.2,4.8,1.8,1.0
+6.1,2.8,4.0,1.3,1.0
+6.3,2.5,4.9,1.5,1.0
+6.1,2.8,4.7,1.2,1.0
+6.4,2.9,4.3,1.3,1.0
+6.6,3.0,4.4,1.4,1.0
+6.8,2.8,4.8,1.4,1.0
+6.7,3.0,5.0,1.7,1.0
+6.0,2.9,4.5,1.5,1.0
+5.7,2.6,3.5,1.0,1.0
+5.5,2.4,3.8,1.1,1.0
+5.5,2.4,3.7,1.0,1.0
+5.8,2.7,3.9,1.2,1.0
+6.0,2.7,5.1,1.6,1.0
+5.4,3.0,4.5,1.5,1.0
+6.0,3.4,4.5,1.6,1.0
+6.7,3.1,4.7,1.5,1.0
+6.3,2.3,4.4,1.3,1.0
+5.6,3.0,4.1,1.3,1.0
+5.5,2.5,4.0,1.3,1.0
+5.5,2.6,4.4,1.2,1.0
+6.1,3.0,4.6,1.4,1.0
+5.8,2.6,4.0,1.2,1.0
+5.0,2.3,3.3,1.0,1.0
+5.6,2.7,4.2,1.3,1.0
+5.7,3.0,4.2,1.2,1.0
+5.7,2.9,4.2,1.3,1.0
+6.2,2.9,4.3,1.3,1.0
+5.1,2.5,3.0,1.1,1.0
+5.7,2.8,4.1,1.3,1.0
\ No newline at end of file

From 49956803f0514808b363361b06fa8b2cac5e316d Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Mon, 25 Jun 2018 16:12:57 +0800
Subject: [PATCH 28/49] add WOE

---
 evaluate.py | 33 +++++++++++++++++++++++++--------
 main.py     |  7 ++++---
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index a00afe6..575f24f 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -1,15 +1,33 @@
-from sklearn  import metrics
+# -*-coding:utf-8 -*-
+
+from sklearn import metrics
 import matplotlib.pyplot as plt
 
 
-def auc(model, test_data, fea_list, target):
-    predict_value = model.predict_proba(test_data[fea_list])[:, 1]
-    return metrics.roc_auc_score(test_data[target], predict_value)
+def auc(model, test_data):
+    """
+
+    :param model:
+    :param test_data:
+    :param fea_list:
+    :param target:
+    :return:
+    """
+    predict_value = model.predict_proba(test_data.ix[:,0:-1])[:, 1]
+    return metrics.roc_auc_score(test_data.ix[:,-1], predict_value)
 
 
-def roc(model, test_data, fea_list, target):
-    predict_value = model.predict_proba(test_data[fea_list])[:, 1]
-    fpr, tpr, thresholds = metrics.roc_curve(test_data[target], predict_value)
+def roc(model, test_data):
+    """
+
+    :param model:
+    :param test_data:
+    :param fea_list:
+    :param target:
+    :return:
+    """
+    predict_value = model.predict_proba(test_data.ix[:,0:-1])[:, 1]
+    fpr, tpr, thresholds = metrics.roc_curve(test_data.ix[:,-1], predict_value)
     roc_auc = metrics.auc(fpr, tpr)
     plt.figure()
     plt.plot(fpr, tpr, label='data1, AUC = %0.2f' % roc_auc)
@@ -18,4 +36,3 @@ def roc(model, test_data, fea_list, target):
     plt.ylabel("True Positive Rate")
     plt.title("ROC Diagram")
     plt.show()
-
diff --git a/main.py b/main.py
index 98dd6b7..fa40647 100644
--- a/main.py
+++ b/main.py
@@ -88,9 +88,10 @@ def split_data(data_to_split):
     binning.auto_binning(data, 'Label', 'PetalWidth', 10)
     train_data, test_data = split_data(data)
     model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
-    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10)
+    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model,
+                                         0.5, 100, 10)
     print(list(zip(test_data['Label'].values, predict_score)))
 
-    auc = evaluate.auc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
+    auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
     print("au值: " + str(auc))
-    evaluate.roc(model, test_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
+    evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])

From 479481fb7fc564ad7bbbd41f16a5f0dd44871afa Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Tue, 26 Jun 2018 18:37:28 +0800
Subject: [PATCH 29/49] add WOE

---
 evaluate.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 575f24f..0c84e84 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -13,8 +13,8 @@ def auc(model, test_data):
     :param target:
     :return:
     """
-    predict_value = model.predict_proba(test_data.ix[:,0:-1])[:, 1]
-    return metrics.roc_auc_score(test_data.ix[:,-1], predict_value)
+    predict_value = model.predict_proba(test_data.ix[:, 0:-1])[:, 1]
+    return metrics.roc_auc_score(test_data.ix[:, -1], predict_value)
 
 
 def roc(model, test_data):
@@ -26,8 +26,8 @@ def roc(model, test_data):
     :param target:
     :return:
     """
-    predict_value = model.predict_proba(test_data.ix[:,0:-1])[:, 1]
-    fpr, tpr, thresholds = metrics.roc_curve(test_data.ix[:,-1], predict_value)
+    predict_value = model.predict_proba(test_data.ix[:, 0:-1])[:, 1]
+    fpr, tpr, thresholds = metrics.roc_curve(test_data.ix[:, -1], predict_value)
     roc_auc = metrics.auc(fpr, tpr)
     plt.figure()
     plt.plot(fpr, tpr, label='data1, AUC = %0.2f' % roc_auc)

From f9cf842e3035e2f9af4100682f4d3db40fc29fbd Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 28 Jun 2018 08:56:55 +0800
Subject: [PATCH 30/49] add WOE

---
 binning.py  | 17 +++++++++++++++--
 evaluate.py | 16 ++++++----------
 main.py     | 41 ++++++++++++++++++++++++++++++-----------
 3 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/binning.py b/binning.py
index c38936c..a83f09a 100644
--- a/binning.py
+++ b/binning.py
@@ -28,7 +28,7 @@ def equal_frequency_binning(df, fea_name, bin_count):
     df[fea_name + '_f'] = pd.cut(df[fea_name], bin_count)
 
 
-def auto_binning(df, target_name, feature_name, max_bin_count):
+def auto_binning(df, feature_name, target_name, max_bin_count):
     """
     自动分箱
     :param df:
@@ -58,7 +58,11 @@ def auto_binning(df, target_name, feature_name, max_bin_count):
 
 
 def chi2(A):
-    ''' Compute the Chi-Square value '''
+    """
+    计算卡方值
+    :param A:需要计算卡方的两行数据
+    :return: 卡方值
+    """
     m, k = A.shape  # 行数 列数
 
     R = A.sum(axis=1)  # 行求和结果
@@ -75,6 +79,14 @@ def chi2(A):
 
 
 def chi_merge(df, fea_name, target_name, dis_count):
+    """
+    chiMerge的主算法
+    :param df:数据，dataframe格式
+    :param fea_name:需要进行分段的特征名称
+    :param target_name:目标变量名称
+    :param dis_count:最大分组数
+    :return: 分割点
+    """
     fea_count = df[[fea_name, target_name]].copy().groupby([fea_name, target_name]).size().unstack().fillna(0.0)
     while fea_count.shape[0] > dis_count:
         chi_list = []
@@ -94,6 +106,7 @@ def chi_merge(df, fea_name, target_name, dis_count):
             fea_count.drop([next_fea], inplace=True)
             chi_list.remove(chi_list[chi_min_index + 1])
     print(fea_count)
+    return fea_count
 
 
 def discrete(path):
diff --git a/evaluate.py b/evaluate.py
index 0c84e84..9b96214 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -7,11 +7,9 @@
 def auc(model, test_data):
     """
 
-    :param model:
-    :param test_data:
-    :param fea_list:
-    :param target:
-    :return:
+    :param model:模型
+    :param test_data:测试数据，dataframe格式，第一列至倒数第二列为特征字段，最后一列为目标字段
+    :return:auc值
     """
     predict_value = model.predict_proba(test_data.ix[:, 0:-1])[:, 1]
     return metrics.roc_auc_score(test_data.ix[:, -1], predict_value)
@@ -20,11 +18,9 @@ def auc(model, test_data):
 def roc(model, test_data):
     """
 
-    :param model:
-    :param test_data:
-    :param fea_list:
-    :param target:
-    :return:
+    :param model:模型
+    :param test_data:测试数据，dataframe格式，第一列至倒数第二列为特征字段，最后一列为目标字段
+    :return:roc曲线
     """
     predict_value = model.predict_proba(test_data.ix[:, 0:-1])[:, 1]
     fpr, tpr, thresholds = metrics.roc_curve(test_data.ix[:, -1], predict_value)
diff --git a/main.py b/main.py
index fa40647..739fa2b 100644
--- a/main.py
+++ b/main.py
@@ -6,6 +6,8 @@
 import binning
 import evaluate
 import modeling
+import woe
+import math
 
 
 def file_info(file_path):
@@ -83,15 +85,32 @@ def split_data(data_to_split):
 
     # print(t[0].shape)
     # print(t[1].shape)
-    binning.auto_binning(data, 'Label', 'SepalLength', 10)
-    binning.auto_binning(data, 'Label', 'PetalLength', 10)
-    binning.auto_binning(data, 'Label', 'PetalWidth', 10)
-    train_data, test_data = split_data(data)
-    model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
-    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model,
-                                         0.5, 100, 10)
-    print(list(zip(test_data['Label'].values, predict_score)))
+    # binning.auto_binning(data, 'SepalLength','Label', 10)
+    # binning.auto_binning(data, 'PetalLength','Label', 10)
+    # binning.auto_binning(data, 'PetalWidth','Label',  10)
+    # train_data, test_data = split_data(data)
+    # model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
+    # predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model,
+    #                                      0.5, 100, 10)
+    # print(list(zip(test_data['Label'].values, predict_score)))
+    #
+    # auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
+    # print("au值: " + str(auc))
+    # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
 
-    auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
-    print("au值: " + str(auc))
-    evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
+    bins = binning.chi_merge(data, 'SepalLength', 'Label', 5)
+    bin_index = bins.index.values.astype(float).copy()
+    bin_index[0] = -np.inf
+    bin_index = np.append(bin_index, np.inf)
+    interval_list = []
+    woe_list = []
+    for i in range(len(bin_index) - 1):
+        if bin_index[i] == bin_index[i + 1]:
+            continue
+        else:
+            interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left'))
+            woe_list.append(
+                math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum())))
+    print(interval_list, woe_list)
+
+    print(bins)

From c79eb159dadc6ed5a9b8d2a94c6415d00d7207b6 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 28 Jun 2018 08:57:19 +0800
Subject: [PATCH 31/49] add WOE

---
 main.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/main.py b/main.py
index 739fa2b..8238aec 100644
--- a/main.py
+++ b/main.py
@@ -112,5 +112,4 @@ def split_data(data_to_split):
             woe_list.append(
                 math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum())))
     print(interval_list, woe_list)
-
     print(bins)

From 3e4b5ccac4178fd9b3450f416b238cf1332ba7b4 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 28 Jun 2018 09:35:59 +0800
Subject: [PATCH 32/49] add WOE

---
 main.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/main.py b/main.py
index 8238aec..3df2695 100644
--- a/main.py
+++ b/main.py
@@ -99,17 +99,27 @@ def split_data(data_to_split):
     # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
 
     bins = binning.chi_merge(data, 'SepalLength', 'Label', 5)
-    bin_index = bins.index.values.astype(float).copy()
-    bin_index[0] = -np.inf
+    bin_index = bins.index.values.astype(float)
+    # bin_index[0] = -np.inf
     bin_index = np.append(bin_index, np.inf)
     interval_list = []
     woe_list = []
+    max_woe = 20
+    min_woe = -20
     for i in range(len(bin_index) - 1):
         if bin_index[i] == bin_index[i + 1]:
             continue
         else:
             interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left'))
-            woe_list.append(
-                math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum())))
-    print(interval_list, woe_list)
+            rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum()
+            rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum()
+            if rate_event == 0.0:
+                woe_list.append(min_woe)
+            elif rate_non_event == 0.0:
+                woe_list.append(max_woe)
+            else:
+                woe_list.append(
+                    math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum())))
+    bins['interval'] = interval_list
+    bins['woe'] = woe_list
     print(bins)

From fd41b88dbdcbc9813698180ab22b793bc45abc90 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 28 Jun 2018 09:46:43 +0800
Subject: [PATCH 33/49] add WOE

---
 main.py | 26 +-------------------------
 woe.py  | 35 +++++++++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/main.py b/main.py
index 3df2695..6b87dca 100644
--- a/main.py
+++ b/main.py
@@ -97,29 +97,5 @@ def split_data(data_to_split):
     # auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
     # print("au值: " + str(auc))
     # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
-
     bins = binning.chi_merge(data, 'SepalLength', 'Label', 5)
-    bin_index = bins.index.values.astype(float)
-    # bin_index[0] = -np.inf
-    bin_index = np.append(bin_index, np.inf)
-    interval_list = []
-    woe_list = []
-    max_woe = 20
-    min_woe = -20
-    for i in range(len(bin_index) - 1):
-        if bin_index[i] == bin_index[i + 1]:
-            continue
-        else:
-            interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left'))
-            rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum()
-            rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum()
-            if rate_event == 0.0:
-                woe_list.append(min_woe)
-            elif rate_non_event == 0.0:
-                woe_list.append(max_woe)
-            else:
-                woe_list.append(
-                    math.log((bins[0.0][bin_index[i]] / bins[0.0].sum()) / (bins[1.0][bin_index[i]] / bins[1.0].sum())))
-    bins['interval'] = interval_list
-    bins['woe'] = woe_list
-    print(bins)
+    woe.my_woe(bins)
diff --git a/woe.py b/woe.py
index 08e086b..08ae216 100644
--- a/woe.py
+++ b/woe.py
@@ -158,7 +158,7 @@ def discrete(self, x):
             res[mask] = (i + 1)
         return res
 
-    def woe_feature(self,x,dict):
+    def woe_feature(self, x, dict):
         new_x = []
         for i in x:
             new_x.append(dict[i])
@@ -180,12 +180,39 @@ def WOE_MAX(self):
     def WOE_MAX(self, woe_max):
         self._WOE_MAX = woe_max
 
+
+def my_woe(bins):
+    bin_index = bins.index.values.astype(float)
+    # bin_index[0] = -np.inf
+    bin_index = np.append(bin_index, np.inf)
+    interval_list = []
+    woe_list = []
+    max_woe = 20
+    min_woe = -20
+    for i in range(len(bin_index) - 1):
+        if bin_index[i] == bin_index[i + 1]:
+            continue
+        else:
+            interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left'))
+            rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum()
+            rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum()
+            if rate_event == 0.0:
+                woe_list.append(min_woe)
+            elif rate_non_event == 0.0:
+                woe_list.append(max_woe)
+            else:
+                woe_list.append(math.log(rate_event / rate_non_event))
+    bins['interval'] = interval_list
+    bins['woe'] = woe_list
+    print(bins)
+
+
 if __name__ == '__main__':
     # path=input('Please input the file path: ')
     path = 'iris.csv'
     raw_data = pd.read_csv(path)
     # print(raw_data)
-    woe=WOE()
+    woe = WOE()
     # woe_result=woe.woe_single_x(x=raw_data,'SepalLength')
-    ret=pd.cut(raw_data['SepalLength'],5)
-    print(ret)
\ No newline at end of file
+    ret = pd.cut(raw_data['SepalLength'], 5)
+    print(ret)

From 6ad5eb031acec968198a8790d214b58613a0f524 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 29 Jun 2018 16:30:45 +0800
Subject: [PATCH 34/49] add WOE

---
 main.py | 5 ++++-
 woe.py  | 7 ++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 6b87dca..bf47c83 100644
--- a/main.py
+++ b/main.py
@@ -98,4 +98,7 @@ def split_data(data_to_split):
     # print("au值: " + str(auc))
     # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
     bins = binning.chi_merge(data, 'SepalLength', 'Label', 5)
-    woe.my_woe(bins)
+
+    bin_woe = woe.my_woe(bins)
+    data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf]))
+    print(data.head(20))
diff --git a/woe.py b/woe.py
index 08ae216..abdf403 100644
--- a/woe.py
+++ b/woe.py
@@ -183,12 +183,12 @@ def WOE_MAX(self, woe_max):
 
 def my_woe(bins):
     bin_index = bins.index.values.astype(float)
-    # bin_index[0] = -np.inf
+    bin_index[0] = -np.inf
     bin_index = np.append(bin_index, np.inf)
     interval_list = []
     woe_list = []
-    max_woe = 20
-    min_woe = -20
+    max_woe = 10
+    min_woe = -10
     for i in range(len(bin_index) - 1):
         if bin_index[i] == bin_index[i + 1]:
             continue
@@ -205,6 +205,7 @@ def my_woe(bins):
     bins['interval'] = interval_list
     bins['woe'] = woe_list
     print(bins)
+    return dict(zip(interval_list, woe_list))
 
 
 if __name__ == '__main__':

From b45c3c3725d771a28e9eb04be470c8e3d515db6a Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 29 Jun 2018 18:14:40 +0800
Subject: [PATCH 35/49] add WOE

---
 binning.py |  1 -
 main.py    |  9 ++++++---
 woe.py     | 34 +++++++++++++++++++---------------
 3 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/binning.py b/binning.py
index a83f09a..f64c18a 100644
--- a/binning.py
+++ b/binning.py
@@ -105,7 +105,6 @@ def chi_merge(df, fea_name, target_name, dis_count):
             fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea]
             fea_count.drop([next_fea], inplace=True)
             chi_list.remove(chi_list[chi_min_index + 1])
-    print(fea_count)
     return fea_count
 
 
diff --git a/main.py b/main.py
index bf47c83..17d69d6 100644
--- a/main.py
+++ b/main.py
@@ -8,6 +8,8 @@
 import modeling
 import woe
 import math
+from pandas import Interval
+from numpy import inf
 
 
 def file_info(file_path):
@@ -99,6 +101,7 @@ def split_data(data_to_split):
     # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
     bins = binning.chi_merge(data, 'SepalLength', 'Label', 5)
 
-    bin_woe = woe.my_woe(bins)
-    data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf]))
-    print(data.head(20))
+    bin_woe = woe.my_woe(data,bins)
+    # data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(str)
+    # data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x])
+    print(data)
diff --git a/woe.py b/woe.py
index abdf403..f2ce8af 100644
--- a/woe.py
+++ b/woe.py
@@ -181,9 +181,12 @@ def WOE_MAX(self, woe_max):
         self._WOE_MAX = woe_max
 
 
-def my_woe(bins):
+def my_woe(data,bins):
+    fea_name = bins.index.name
     bin_index = bins.index.values.astype(float)
     bin_index[0] = -np.inf
+    bins.index = bin_index
+    bins.index.name = fea_name
     bin_index = np.append(bin_index, np.inf)
     interval_list = []
     woe_list = []
@@ -193,7 +196,7 @@ def my_woe(bins):
         if bin_index[i] == bin_index[i + 1]:
             continue
         else:
-            interval_list.append(pd.Interval(left=bin_index[i], right=bin_index[i + 1], closed='left'))
+            interval_list.append('('+str(bin_index[i])+', '+str(bin_index[i + 1])+']')
             rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum()
             rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum()
             if rate_event == 0.0:
@@ -204,16 +207,17 @@ def my_woe(bins):
                 woe_list.append(math.log(rate_event / rate_non_event))
     bins['interval'] = interval_list
     bins['woe'] = woe_list
-    print(bins)
-    return dict(zip(interval_list, woe_list))
-
-
-if __name__ == '__main__':
-    # path=input('Please input the file path: ')
-    path = 'iris.csv'
-    raw_data = pd.read_csv(path)
-    # print(raw_data)
-    woe = WOE()
-    # woe_result=woe.woe_single_x(x=raw_data,'SepalLength')
-    ret = pd.cut(raw_data['SepalLength'], 5)
-    print(ret)
+    bin_woe=dict(zip(interval_list, woe_list))
+    data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(str)
+    data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x])
+
+
+# if __name__ == '__main__':
+#     path=input('Please input the file path: ')
+#     path = 'iris.csv'
+#     raw_data = pd.read_csv(path)
+#     print(raw_data)
+#     woe = WOE()
+#     woe_result=woe.woe_single_x(x=raw_data,'SepalLength')
+#     ret = pd.cut(raw_data['SepalLength'], 5)
+#     print(ret)

From ecbbc011631410fe1353607f82399ec8561d2af3 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 29 Jun 2018 18:16:44 +0800
Subject: [PATCH 36/49] add WOE

---
 binning.py |  1 -
 main.py    |  6 ++----
 woe.py     | 10 +++++-----
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/binning.py b/binning.py
index f64c18a..291e78f 100644
--- a/binning.py
+++ b/binning.py
@@ -93,7 +93,6 @@ def chi_merge(df, fea_name, target_name, dis_count):
         for i in range(fea_count.shape[0] - 1):
             chi_value = chi2(fea_count.iloc[i:i + 2].values)
             chi_list.append([fea_count.index[i], chi_value])
-
         chi_min_index = np.argmin(np.array(chi_list)[:, 1])
         if chi_min_index == len(chi_list) - 1:
             current_fea = chi_list[chi_min_index][0]
diff --git a/main.py b/main.py
index 17d69d6..69c12b9 100644
--- a/main.py
+++ b/main.py
@@ -99,9 +99,7 @@ def split_data(data_to_split):
     # auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
     # print("au值: " + str(auc))
     # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
-    bins = binning.chi_merge(data, 'SepalLength', 'Label', 5)
 
-    bin_woe = woe.my_woe(data,bins)
-    # data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(str)
-    # data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x])
+    bins = binning.chi_merge(data, 'SepalLength', 'Label', 5)
+    bin_woe = woe.add_woe_col(data, bins)
     print(data)
diff --git a/woe.py b/woe.py
index f2ce8af..ac516b7 100644
--- a/woe.py
+++ b/woe.py
@@ -181,7 +181,7 @@ def WOE_MAX(self, woe_max):
         self._WOE_MAX = woe_max
 
 
-def my_woe(data,bins):
+def add_woe_col(data, bins):
     fea_name = bins.index.name
     bin_index = bins.index.values.astype(float)
     bin_index[0] = -np.inf
@@ -196,7 +196,7 @@ def my_woe(data,bins):
         if bin_index[i] == bin_index[i + 1]:
             continue
         else:
-            interval_list.append('('+str(bin_index[i])+', '+str(bin_index[i + 1])+']')
+            interval_list.append('(' + str(bin_index[i]) + ', ' + str(bin_index[i + 1]) + ']')
             rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum()
             rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum()
             if rate_event == 0.0:
@@ -207,11 +207,11 @@ def my_woe(data,bins):
                 woe_list.append(math.log(rate_event / rate_non_event))
     bins['interval'] = interval_list
     bins['woe'] = woe_list
-    bin_woe=dict(zip(interval_list, woe_list))
-    data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(str)
+    bin_woe = dict(zip(interval_list, woe_list))
+    data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(
+        str)
     data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x])
 
-
 # if __name__ == '__main__':
 #     path=input('Please input the file path: ')
 #     path = 'iris.csv'

From 23ee1ef06167883b1b3eafd6d58defc30fa96a54 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 29 Jun 2018 18:25:43 +0800
Subject: [PATCH 37/49] add WOE

---
 woe.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/woe.py b/woe.py
index ac516b7..7bb40c0 100644
--- a/woe.py
+++ b/woe.py
@@ -182,6 +182,12 @@ def WOE_MAX(self, woe_max):
 
 
 def add_woe_col(data, bins):
+    """
+    为指定特征添加一列对应的WOE值
+    :param data:原始数据
+    :param bins:分段信息
+    :return:在原始数据上添加一列
+    """
     fea_name = bins.index.name
     bin_index = bins.index.values.astype(float)
     bin_index[0] = -np.inf
@@ -205,12 +211,10 @@ def add_woe_col(data, bins):
                 woe_list.append(max_woe)
             else:
                 woe_list.append(math.log(rate_event / rate_non_event))
-    bins['interval'] = interval_list
-    bins['woe'] = woe_list
     bin_woe = dict(zip(interval_list, woe_list))
-    data[bins.index.name + '_bin'] = pd.cut(data[bins.index.name], bins=np.append(bins.index.values, [np.inf])).astype(
-        str)
-    data[bins.index.name + '_woe'] = data[bins.index.name + '_bin'].apply(lambda x: bin_woe[x])
+    data[fea_name + '_bin'] = pd.cut(data[fea_name], bins=np.append(bins.index.values, [np.inf])).astype(str)
+    data[fea_name + '_woe'] = data[fea_name + '_bin'].apply(lambda x: bin_woe[x])
+    del data[fea_name + '_bin']
 
 # if __name__ == '__main__':
 #     path=input('Please input the file path: ')

From c541709804707ee2dc0263efcd0034f24275efea Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 29 Jun 2018 18:33:57 +0800
Subject: [PATCH 38/49] add WOE

---
 binning.py | 38 +++++++++++++++++++-------------------
 woe.py     |  1 -
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/binning.py b/binning.py
index 291e78f..f4d01d0 100644
--- a/binning.py
+++ b/binning.py
@@ -6,7 +6,7 @@
 from scipy import stats
 
 
-def equal_distance_binning(df, fea_name, bin_count):
+def equal_distance_binning(df, fea_name, target_name, bin_count):
     """
     等距分箱
     :param df:
@@ -15,9 +15,12 @@ def equal_distance_binning(df, fea_name, bin_count):
     :return:
     """
     df[fea_name + '_d'] = pd.cut(df[fea_name], bin_count)
+    fea_count = df[[fea_name + '_d', target_name]].copy().groupby(
+        [fea_name + '_d', target_name]).size().unstack().fillna(0.0)
+    return fea_count
 
 
-def equal_frequency_binning(df, fea_name, bin_count):
+def equal_frequency_binning(df, fea_name, target_name, bin_count):
     """
     等频分箱
     :param df:
@@ -26,35 +29,32 @@ def equal_frequency_binning(df, fea_name, bin_count):
     :return:
     """
     df[fea_name + '_f'] = pd.cut(df[fea_name], bin_count)
+    fea_count = df[[fea_name + '_f', target_name]].copy().groupby(
+        [fea_name + '_f', target_name]).size().unstack().fillna(0.0)
+    return fea_count
 
 
-def auto_binning(df, feature_name, target_name, max_bin_count):
+def auto_binning(df, fea_name, target_name, max_bin_count):
     """
     自动分箱
     :param df:
     :param target_name: 目标变量名
-    :param feature_name:特征变量名称
+    :param fea_name:特征变量名称
     :param max_bin_count:最大分箱数
     :return:
     """
     r = 0
-    good = df[target_name].sum()
-    bad = df[target_name].count() - good
     while np.abs(r) < 1:
-        d1 = pd.DataFrame({'X': df[feature_name],
+        d1 = pd.DataFrame({'X': df[fea_name],
                            'Y': df[target_name],
-                           'Bucket': pd.qcut(df[feature_name], max_bin_count, duplicates='drop')})
-        d2 = d1.groupby('Bucket', as_index=True)
+                           fea_name + '_d': pd.qcut(df[fea_name], max_bin_count, duplicates='drop')})
+        d2 = d1.groupby(fea_name + '_d', as_index=True)
         r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
         max_bin_count = max_bin_count - 1
-    woe = np.log((d2.mean().Y / (1 - d2.mean().Y)) / (good / bad))
-    woe_dict = woe.to_dict()
-    woe_values = sorted(list(woe_dict.values()))
-    print(woe_values)
-    # 如果存在woe为inf情况，将其替换为不为inf的最大值加一
-    df[feature_name + '_woe'] = d1['Bucket'].apply(lambda x: woe_dict[x]) \
-        .replace(np.inf, woe_values[-2] + 1) \
-        .replace(-np.inf, woe_values[1] - 1)
+
+    fea_count = df[[fea_name + '_d', target_name]].copy().groupby(
+        [fea_name + '_d', target_name]).size().unstack().fillna(0.0)
+    return fea_count
 
 
 def chi2(A):
@@ -78,7 +78,7 @@ def chi2(A):
     return res
 
 
-def chi_merge(df, fea_name, target_name, dis_count):
+def chi_merge(df, fea_name, target_name, max_bin_count):
     """
     chiMerge的主算法
     :param df:数据，dataframe格式
@@ -88,7 +88,7 @@ def chi_merge(df, fea_name, target_name, dis_count):
     :return: 分割点
     """
     fea_count = df[[fea_name, target_name]].copy().groupby([fea_name, target_name]).size().unstack().fillna(0.0)
-    while fea_count.shape[0] > dis_count:
+    while fea_count.shape[0] > max_bin_count:
         chi_list = []
         for i in range(fea_count.shape[0] - 1):
             chi_value = chi2(fea_count.iloc[i:i + 2].values)
diff --git a/woe.py b/woe.py
index 7bb40c0..5e34ba9 100644
--- a/woe.py
+++ b/woe.py
@@ -1,7 +1,6 @@
 # -*- coding:utf-8 -*-
 
 import pandas as pd
-from math import log
 import numpy as np
 import math
 from scipy import stats

From ccb2c348d5c97d031ad645670875f3877fe3197d Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 29 Jun 2018 18:59:09 +0800
Subject: [PATCH 39/49] add WOE

---
 binning.py | 231 +++++++++++++++++++++++++++--------------------------
 main.py    |   6 +-
 2 files changed, 123 insertions(+), 114 deletions(-)

diff --git a/binning.py b/binning.py
index f4d01d0..211dc2a 100644
--- a/binning.py
+++ b/binning.py
@@ -6,115 +6,122 @@
 from scipy import stats
 
 
-def equal_distance_binning(df, fea_name, target_name, bin_count):
-    """
-    等距分箱
-    :param df:
-    :param fea_name:
-    :param bin_count
-    :return:
-    """
-    df[fea_name + '_d'] = pd.cut(df[fea_name], bin_count)
-    fea_count = df[[fea_name + '_d', target_name]].copy().groupby(
-        [fea_name + '_d', target_name]).size().unstack().fillna(0.0)
-    return fea_count
-
-
-def equal_frequency_binning(df, fea_name, target_name, bin_count):
-    """
-    等频分箱
-    :param df:
-    :param fea_name:
-    :param bin_count
-    :return:
-    """
-    df[fea_name + '_f'] = pd.cut(df[fea_name], bin_count)
-    fea_count = df[[fea_name + '_f', target_name]].copy().groupby(
-        [fea_name + '_f', target_name]).size().unstack().fillna(0.0)
-    return fea_count
-
-
-def auto_binning(df, fea_name, target_name, max_bin_count):
-    """
-    自动分箱
-    :param df:
-    :param target_name: 目标变量名
-    :param fea_name:特征变量名称
-    :param max_bin_count:最大分箱数
-    :return:
-    """
-    r = 0
-    while np.abs(r) < 1:
-        d1 = pd.DataFrame({'X': df[fea_name],
-                           'Y': df[target_name],
-                           fea_name + '_d': pd.qcut(df[fea_name], max_bin_count, duplicates='drop')})
-        d2 = d1.groupby(fea_name + '_d', as_index=True)
-        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
-        max_bin_count = max_bin_count - 1
-
-    fea_count = df[[fea_name + '_d', target_name]].copy().groupby(
-        [fea_name + '_d', target_name]).size().unstack().fillna(0.0)
-    return fea_count
-
-
-def chi2(A):
-    """
-    计算卡方值
-    :param A:需要计算卡方的两行数据
-    :return: 卡方值
-    """
-    m, k = A.shape  # 行数 列数
-
-    R = A.sum(axis=1)  # 行求和结果
-    C = A.sum(axis=0)  # 列求和结果
-    N = A.sum()  # 总和
-
-    res = 0
-    for i in range(m):
-        for j in range(k):
-            Eij = 1.0 * R[i] * C[j] / N
-            if Eij != 0:
-                res = 1.0 * res + (A[i][j] - Eij) ** 2 / Eij
-    return res
-
-
-def chi_merge(df, fea_name, target_name, max_bin_count):
-    """
-    chiMerge的主算法
-    :param df:数据，dataframe格式
-    :param fea_name:需要进行分段的特征名称
-    :param target_name:目标变量名称
-    :param dis_count:最大分组数
-    :return: 分割点
-    """
-    fea_count = df[[fea_name, target_name]].copy().groupby([fea_name, target_name]).size().unstack().fillna(0.0)
-    while fea_count.shape[0] > max_bin_count:
-        chi_list = []
-        for i in range(fea_count.shape[0] - 1):
-            chi_value = chi2(fea_count.iloc[i:i + 2].values)
-            chi_list.append([fea_count.index[i], chi_value])
-        chi_min_index = np.argmin(np.array(chi_list)[:, 1])
-        if chi_min_index == len(chi_list) - 1:
-            current_fea = chi_list[chi_min_index][0]
-            fea_count.loc[current_fea] = fea_count.loc[current_fea:].sum(axis=0)
-            fea_count = fea_count.loc[:current_fea].copy()
-        else:
-            current_fea = chi_list[chi_min_index][0]
-            next_fea = chi_list[chi_min_index + 1][0]
-            fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea]
-            fea_count.drop([next_fea], inplace=True)
-            chi_list.remove(chi_list[chi_min_index + 1])
-    return fea_count
-
-
-def discrete(path):
-    df = pd.read_csv(path)
-    target_name = df.columns[-1]
-    fea_names = df.columns[0:-1]
-    dis_count = 2
-    for f in fea_names:
-        chi_merge(df, f, target_name, dis_count)
-
-
-if __name__ == '__main__':
-    discrete('iris.csv')
+class Bin:
+    def __init__(self, df, target_name, bin_count):
+        self.df = df
+        self.target_name = target_name
+        self.bin_count = bin_count
+
+    def equal_distance_binning(self, fea_name):
+        """
+        等距分箱
+        :param df:
+        :param fea_name:
+        :param target_name:
+        :param bin_count:
+        :return:
+        """
+
+        self.df[fea_name + '_d'] = pd.cut(self.df[fea_name], self.bin_count)
+        fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby(
+            [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0)
+        return fea_count
+
+    def equal_frequency_binning(self, fea_name):
+        """
+        等频分箱
+        :param df:
+        :param fea_name:
+        :param target_name:
+        :param bin_count:
+        :return:
+        """
+        self.df[fea_name + '_f'] = pd.cut(self.df[fea_name], self.bin_count)
+        fea_count = self.df[[fea_name + '_f', self.target_name]].copy().groupby(
+            [fea_name + '_f', self.target_name]).size().unstack().fillna(0.0)
+        return fea_count
+
+    def auto_binning(self, fea_name):
+        """
+        自动分箱
+        :param df:
+        :param target_name: 目标变量名
+        :param fea_name:特征变量名称
+        :param max_bin_count:最大分箱数
+        :return:
+        """
+        r = 0
+        while np.abs(r) < 1:
+            d1 = pd.DataFrame({'X': self.df[fea_name],
+                               'Y': self.df[self.target_name],
+                               fea_name + '_d': pd.qcut(self.df[fea_name], self.bin_count,
+                                                        duplicates='drop')})
+            d2 = d1.groupby(fea_name + '_d', as_index=True)
+            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
+            max_bin_count = max_bin_count - 1
+
+        fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby(
+            [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0)
+        return fea_count
+
+    def chi2(self, A):
+        """
+        计算卡方值
+        :param A:需要计算卡方的两行数据
+        :return: 卡方值
+        """
+        m, k = A.shape  # 行数 列数
+
+        R = A.sum(axis=1)  # 行求和结果
+        C = A.sum(axis=0)  # 列求和结果
+        N = A.sum()  # 总和
+
+        res = 0
+        for i in range(m):
+            for j in range(k):
+                Eij = 1.0 * R[i] * C[j] / N
+                if Eij != 0:
+                    res = 1.0 * res + (A[i][j] - Eij) ** 2 / Eij
+        return res
+
+    def chi_merge(self, fea_name):
+        """
+        chiMerge的主算法
+        :param df:数据，dataframe格式
+        :param fea_name:需要进行分段的特征名称
+        :param target_name:目标变量名称
+        :param dis_count:最大分组数
+        :return: 分割点
+        """
+        fea_count = self.df[[fea_name, self.target_name]].copy().groupby(
+            [fea_name, self.target_name]).size().unstack().fillna(0.0)
+        while fea_count.shape[0] > self.bin_count:
+            chi_list = []
+            for i in range(fea_count.shape[0] - 1):
+                chi_value = self.chi2(fea_count.iloc[i:i + 2].values)
+                chi_list.append([fea_count.index[i], chi_value])
+            chi_min_index = np.argmin(np.array(chi_list)[:, 1])
+            if chi_min_index == len(chi_list) - 1:
+                current_fea = chi_list[chi_min_index][0]
+                fea_count.loc[current_fea] = fea_count.loc[current_fea:].sum(axis=0)
+                fea_count = fea_count.loc[:current_fea].copy()
+            else:
+                current_fea = chi_list[chi_min_index][0]
+                next_fea = chi_list[chi_min_index + 1][0]
+                fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea]
+                fea_count.drop([next_fea], inplace=True)
+                chi_list.remove(chi_list[chi_min_index + 1])
+        return fea_count
+
+#
+# def discrete(path):
+#     df = pd.read_csv(path)
+#     target_name = df.columns[-1]
+#     fea_names = df.columns[0:-1]
+#     dis_count = 2
+#     for f in fea_names:
+#         chi_merge(df, f, target_name, dis_count)
+#
+#
+# if __name__ == '__main__':
+#     discrete('iris.csv')
diff --git a/main.py b/main.py
index 69c12b9..b68dfaf 100644
--- a/main.py
+++ b/main.py
@@ -100,6 +100,8 @@ def split_data(data_to_split):
     # print("au值: " + str(auc))
     # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
 
-    bins = binning.chi_merge(data, 'SepalLength', 'Label', 5)
-    bin_woe = woe.add_woe_col(data, bins)
+    bin = binning.Bin(data, 'Label', 5)
+    for n in data.columns.values[:-1]:
+        bins = bin.chi_merge(n)
+        woe.add_woe_col(data, bins)
     print(data)

From 1910b1ccbbd948e5f2a639fe062e71df547a1915 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Fri, 29 Jun 2018 19:47:28 +0800
Subject: [PATCH 40/49] add WOE

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index b68dfaf..e765317 100644
--- a/main.py
+++ b/main.py
@@ -104,4 +104,4 @@ def split_data(data_to_split):
     for n in data.columns.values[:-1]:
         bins = bin.chi_merge(n)
         woe.add_woe_col(data, bins)
-    print(data)
+    print(data)
\ No newline at end of file

From 7071cf7839cbc147a4f441539ecf7d8122ad9d7e Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Mon, 2 Jul 2018 14:32:01 +0800
Subject: [PATCH 41/49] add WOE

---
 feature_selection.py | 36 ++++++++++++++++++++++++++++++++++++
 main.py              | 12 +++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 feature_selection.py

diff --git a/feature_selection.py b/feature_selection.py
new file mode 100644
index 0000000..f7e4753
--- /dev/null
+++ b/feature_selection.py
@@ -0,0 +1,36 @@
+# -*- coding:utf-8 -*-
+__author__ = 'xujia'
+
+from sklearn.feature_selection import SelectKBest
+from sklearn.feature_selection import chi2
+from sklearn.feature_selection import RFE
+from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.feature_selection import SelectFromModel
+
+
+def chi2_select(X, y, number):
+    """
+    根据卡方筛选变量，
+    :param X:
+    :param y:
+    :param number:
+    :return:
+    """
+    X_new = SelectKBest(chi2, k=number).fit(X, y)
+    print(X_new.scores_)
+    return X_new
+
+
+def fea_select(X, y):
+    clf = DecisionTreeClassifier()
+    clf = clf.fit(X, y)
+    print(clf.feature_importances_)
+    model = SelectFromModel(clf, prefit=True)
+    X_new = model.transform(X)
+    print(X_new)
+
+
+from minepy import MINE
+m = MINE()
\ No newline at end of file
diff --git a/main.py b/main.py
index e765317..2932081 100644
--- a/main.py
+++ b/main.py
@@ -7,11 +7,13 @@
 import evaluate
 import modeling
 import woe
+import feature_selection
 import math
 from pandas import Interval
 from numpy import inf
 
 
+
 def file_info(file_path):
     """
     获取文件信息
@@ -77,6 +79,9 @@ def split_data(data_to_split):
     return splited_data
 
 
+
+
+
 if __name__ == '__main__':
     # path=input('Please input the file path: ')
     path = 'iris.csv'
@@ -104,4 +109,9 @@ def split_data(data_to_split):
     for n in data.columns.values[:-1]:
         bins = bin.chi_merge(n)
         woe.add_woe_col(data, bins)
-    print(data)
\ No newline at end of file
+    print(data)
+
+    # select_func = chi2_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1)
+    # print(select_func.transform(data[['SepalLength', 'SepalWidth']]))
+
+    feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label'])

From 6b5f8a4158c57a6a289dab8b7d173f59a5d1f79c Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Mon, 2 Jul 2018 14:58:30 +0800
Subject: [PATCH 42/49] add mutual information

---
 feature_selection.py | 30 ++++++++++++++++++++++++++++--
 main.py              |  5 +----
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/feature_selection.py b/feature_selection.py
index f7e4753..11570ad 100644
--- a/feature_selection.py
+++ b/feature_selection.py
@@ -8,6 +8,7 @@
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.feature_selection import SelectFromModel
+from minepy import MINE
 
 
 def chi2_select(X, y, number):
@@ -24,6 +25,12 @@ def chi2_select(X, y, number):
 
 
 def fea_select(X, y):
+    """
+    使用决策树筛选变量
+    :param X:
+    :param y:
+    :return:
+    """
     clf = DecisionTreeClassifier()
     clf = clf.fit(X, y)
     print(clf.feature_importances_)
@@ -32,5 +39,24 @@ def fea_select(X, y):
     print(X_new)
 
 
-from minepy import MINE
-m = MINE()
\ No newline at end of file
+def mi(X, y):
+    """
+    计算互信息
+    :param X:
+    :param y:
+    :return:
+    """
+    mi_dict = {}
+    m = MINE()
+    try:
+        if X.shape[1] > 1:
+            for f in X.columns:
+                m.compute_score(X[f], y)
+                mi_dict[f] = m.mic()
+            print(mi_dict)
+            return mi_dict
+    except:
+        m.compute_score(X, y)
+        mi_dict[X.name] = m.mic()
+        print(mi_dict)
+        return mi_dict
diff --git a/main.py b/main.py
index 2932081..7ac4aef 100644
--- a/main.py
+++ b/main.py
@@ -13,7 +13,6 @@
 from numpy import inf
 
 
-
 def file_info(file_path):
     """
     获取文件信息
@@ -79,9 +78,6 @@ def split_data(data_to_split):
     return splited_data
 
 
-
-
-
 if __name__ == '__main__':
     # path=input('Please input the file path: ')
     path = 'iris.csv'
@@ -115,3 +111,4 @@ def split_data(data_to_split):
     # print(select_func.transform(data[['SepalLength', 'SepalWidth']]))
 
     feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label'])
+    feature_selection.mi(data['SepalWidth_woe'], data['Label'])

From c3de04f68a5ff624363c955f890f80c4c741184b Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Tue, 3 Jul 2018 10:44:28 +0800
Subject: [PATCH 43/49] add mutual information

---
 ARUtil.py            | 476 ++++++++++++++++++++++---------------------
 binning.py           |  23 +--
 evaluate.py          |  11 +
 feature_selection.py |   1 +
 main.py              |  13 +-
 woe.py               |   7 +-
 6 files changed, 280 insertions(+), 251 deletions(-)

diff --git a/ARUtil.py b/ARUtil.py
index 1cf4a9c..0281d6a 100755
--- a/ARUtil.py
+++ b/ARUtil.py
@@ -1,229 +1,247 @@
-# encoding:utf-8
-import pandas as pd
-import numpy as np
-import logging
-from scipy import stats
-
-
-class ARFilter(object):
-    def __init__(self, threshold=0.05, dest_var='y'):
-        self.threshold = threshold
-        self.dest_var = dest_var
-        logging.basicConfig()
-        self.logger = logging.getLogger("default")
-        self.logger.setLevel(level=logging.INFO)
-
-    def info_value(self):
-        """
-        信息熵
-        :return:
-        """
-        pass
-
-    def chi_square(self):
-        """
-        卡方
-        :return:
-        """
-        pass
-
-    def train_cal_input(self, excel_name='input.csv'):
-        """
-        AR值筛选
-        输入：宽表【变量1、变量2、目标变量】、筛选下限（默认0.05）、目标变量名称（默认y）
-        输出：筛选后的变量列表【变量名称,AR值】（按照AR值降序排列）
-        计算方式：使用单个变量与目标变量进行逻辑回归运算，返回模型的K-S值即为该变量的AR值。
-        """
-        from sklearn.linear_model import LogisticRegression
-        from sklearn.metrics import roc_curve
-        data = pd.read_csv(excel_name)
-        # 创建逻辑回归模型
-        logit_model = LogisticRegression()
-        final_list = []
-        for col in data.columns.values[0:-1]:
-            if col != self.dest_var:
-                # 特征变量值
-                X = data[col].values.reshape(-1, 1)
-                # 拆分数据集为训练集与测试集
-                x_train = X[:-20]
-                x_test = X[-20:]
-                # 目标变量值
-                y = data[self.dest_var].values.reshape(-1, 1)
-                y_train = y[:-20]
-                y_test = y[-20:]
-                # 数据拟合
-                logit_model.fit(x_train, y_train)
-                # 每一列与y列做预测
-                # prob = logit_model.predict_proba(data[col].values.reshape(-1, 1))
-                prob = logit_model.predict_proba(x_test)
-                # prob[:, 1] 预测结果为两列，分别为0值可能性与1值可能性，此处取1值可能性
-                # fpr, tpr, thresholds = roc_curve(data[self.dest_var].values.reshape(-1, 1), prob[:, 1])
-                fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1])
-                from scipy import stats
-                # AR = float(stats.ks_2samp(y_test, prob[:, 1].reshape(-1, 1)).statistic)
-                # AR = float(stats.ks_2samp(y_test.ravel(), prob[:, 1]).statistic)
-                # testDF = pd.DataFrame()
-                # testDF['predict_proba'] = prob[:,1]
-                # testDF['label'] = np.array(y_test)
-                # print self.cal_ks(testDF)
-                # print str(AR) + "-" * 30
-                ks = abs(fpr - tpr).max()
-                # print str(ks) + "*" * 30
-                # print ks
-                if ks > self.threshold:
-                    final_list.append({'varName': col, "AR": ks})
-                else:
-                    self.logger.info('列：' + col + '的AR值为:' + str(ks) + ", 低于阈值：" + str(self.threshold))
-        # AR值排序
-        final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True)
-        self.logger.info(pd.DataFrame(final_list))
-        pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False)
-
-    def cal_ks(self, data):
-        """手动计算KS值"""
-        #  对样本数据排序，根据预测值升序排序
-        sorted_list = data.sort_values(['predict_proba'], ascending=True)
-        total_good_count = sorted_list['label'].sum() * 1.0
-        total_bad_count = (sorted_list.shape[0] - total_good_count) * 1.0
-        max_ks = 0.0
-        good_count = 0.0
-        bad_count = 0.0
-        for index, row in sorted_list.iterrows():
-            if row['label'] == 0:
-                bad_count += 1.0
-            else:
-                good_count += 1.0
-            val = abs(bad_count / total_bad_count - good_count / total_good_count)
-            max_ks = max(max_ks, val)
-        return max_ks
-
-    def cal_ar(self, excel_name='test.xlsx'):
-        excel = pd.read_excel(excel_name)
-        if excel.columns.size < 2:
-            self.logger.error("未找到Excel数据源！")
-            return
-        dest_value = excel[self.dest_var]
-        final_list = []
-        # result_frame = pd.DataFrame(columns=['varName', 'AR'])
-        for col in excel.columns:
-            if col != self.dest_var:
-                AR = float(stats.ks_2samp(excel[col], dest_value).statistic)
-                final_list
-        # self.logger.info(final_list)
-        # final_list.append({'AR': 1.0, 'colName': u'var3'})
-        # final_list.append({'AR': 0.8, 'colName': u'var4'})
-        final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True)
-        # self.logger.info("final result:" + str(final_list))
-        # self.logger.info("123")
-        self.logger.info(pd.DataFrame(final_list))
-        pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False)
-
-    def fill_empty_value(self, col_name, data, default_value=0):
-        """
-        缺失值填充
-        输入：宽表【变量1、变量2、目标变量】，变量名称，缺失值填充值（默认0）
-        计算方式：直接将指定变量中的缺失值用参数中的填充值进行填充
-        输出：填充后的宽表，变量缺失率
-        """
-        # data = pd.read_excel(file_name)
-        if col_name not in data.columns.values:
-            self.logger.error("输入宽表中不存在指定变量")
-            return
-        else:
-            empty_count = data[col_name].shape[0] - data[col_name].count()
-            if empty_count > 0:
-                self.logger.info('当前共' + str(data.shape[0]) + '个变量值，其中缺失值个数为' + str(empty_count))
-                # 替换空串为NAN
-                data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value)
-                self.logger.info('填补后，缺失值个数为' + str(data[col_name].shape[0] - data[col_name].count()))
-                # data.to_excel('result.xls', index=False)
-                return data
-            else:
-                self.logger.info('当前不存在缺失值')
-
-    def del_empty_value(self, data, empty_rate_threshold=0.5):
-        """
-        缺失值剔除
-        输入：宽表【变量1、变量2、目标变量】，缺失率（默认0.5）
-        计算方式：计算宽表中各个变量的缺失率，并剔除缺失率超过0.5的变量
-        输出：处理后宽表
-        """
-        for col in data.columns.values:
-            if col == 'y':
-                continue
-            empty_ratio = (data[col].shape[0] - data[col].count()) / data[col].shape[0]
-            if empty_ratio >= empty_rate_threshold:
-                self.logger.info("变量：" + col + "缺失率为" + str(empty_ratio) + ",高于阈值：" + str(empty_rate_threshold))
-                data = data.drop(col, axis=1)
-        return data
-        # data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False)
-
-    def console_input(self, prompt="", if_value=[], else_value=[], if_rtn="", else_rtn=""):
-        rtn = input(prompt)
-        if rtn.strip() in if_value:
-            return if_rtn
-        elif rtn.strip() in else_value or len(else_value) == 0:
-            return else_rtn
-        else:
-            raise IOError("未匹配到条件")
-
-    def file_info(self, path):
-        """
-        获取文件信息
-        :param path: 文件路径
-        :return: {字段名称：[字段类型，数据量，空值个数]}
-        """
-        info_dict = {}
-        data = pd.read_csv(path)
-        for c in data.columns:
-            ctype = data[c].dtype
-            nc = data[c].size - data[c].notnull().sum()
-            info_dict[c] = [ctype, data[c].size, nc]  # 字段类型，数据量，空值个数
-        return info_dict, data
-
-    def is_contain_empty_value(self, file_dict):
-        empty_col_list = []
-        for item in file_dict:
-            self.logger.info(file_dict[item])
-            if int(file_dict[item][2]) > 0:
-                self.logger.info("列" + item + "空值个数：" + str(file_dict[item][2]))
-                empty_col_list.append(item)
-        if len(empty_col_list) > 0:
-            return True, empty_col_list
-        else:
-            return False, []
-
-    def main(self):
-        file_path = input("请输入待处理的文件名路径：")
-        import os.path
-        if os.path.isfile(file_path):
-            file_dict, data = self.file_info(file_path)
-            is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict)
-            if is_contain_empty_value:
-                self.logger.info("当前存在缺失值")
-                is_fill_empty = self.console_input(prompt="是否需要填充数据？1：是，其他值：否", if_value=["1"], else_value=[],
-                                                   if_rtn=True, else_rtn=False)
-                if is_fill_empty:
-                    for col in empty_col_list:
-                        fill_value = input("请输入列" + col + "待填充的数据：")
-                        self.logger.info("列" + col + "将填充数据：" + fill_value)
-                        data = self.fill_empty_value(col_name=col, data=data, default_value=fill_value)
-                    print(data)
-                else:
-                    self.logger.info("不填充数据，程序退出")
-            else:
-                self.logger.info("当前不存在缺失数据")
-        else:
-            self.logger.error("指定的文件路径不存在")
-
-
-def run():
-    ar = ARFilter()
-    # ar.train_cal_input()
-    # ar.fill_empty_value(col_name='emptyCol', file_name='empty.xls', default_value=0)
-    # ar.del_empty_value(file_name="empty_ratio.xls")
-    ar.main()
-
-
-if __name__ == "__main__":
-    run()
+# -*- coding:utf-8 -*-
+
+from sklearn import metrics
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+
+# class ARFilter(object):
+#     def __init__(self, threshold=0.05, dest_var='y'):
+#         self.threshold = threshold
+#         self.dest_var = dest_var
+#         logging.basicConfig()
+#         self.logger = logging.getLogger("default")
+#         self.logger.setLevel(level=logging.INFO)
+#
+#     def info_value(self):
+#         """
+#         信息熵
+#         :return:
+#         """
+#         pass
+#
+#     def chi_square(self):
+#         """
+#         卡方
+#         :return:
+#         """
+#         pass
+#
+#     def train_cal_input(self, excel_name='input.csv'):
+#         """
+#         AR值筛选
+#         输入：宽表【变量1、变量2、目标变量】、筛选下限（默认0.05）、目标变量名称（默认y）
+#         输出：筛选后的变量列表【变量名称,AR值】（按照AR值降序排列）
+#         计算方式：使用单个变量与目标变量进行逻辑回归运算，返回模型的K-S值即为该变量的AR值。
+#         """
+#         from sklearn.linear_model import LogisticRegression
+#         from sklearn.metrics import roc_curve
+#         data = pd.read_csv(excel_name)
+#         # 创建逻辑回归模型
+#         logit_model = LogisticRegression()
+#         final_list = []
+#         for col in data.columns.values[0:-1]:
+#             if col != self.dest_var:
+#                 # 特征变量值
+#                 X = data[col].values.reshape(-1, 1)
+#                 # 拆分数据集为训练集与测试集
+#                 x_train = X[:-20]
+#                 x_test = X[-20:]
+#                 # 目标变量值
+#                 y = data[self.dest_var].values.reshape(-1, 1)
+#                 y_train = y[:-20]
+#                 y_test = y[-20:]
+#                 # 数据拟合
+#                 logit_model.fit(x_train, y_train)
+#                 # 每一列与y列做预测
+#                 # prob = logit_model.predict_proba(data[col].values.reshape(-1, 1))
+#                 prob = logit_model.predict_proba(x_test)
+#                 # prob[:, 1] 预测结果为两列，分别为0值可能性与1值可能性，此处取1值可能性
+#                 # fpr, tpr, thresholds = roc_curve(data[self.dest_var].values.reshape(-1, 1), prob[:, 1])
+#                 fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1])
+#                 from scipy import stats
+#                 # AR = float(stats.ks_2samp(y_test, prob[:, 1].reshape(-1, 1)).statistic)
+#                 # AR = float(stats.ks_2samp(y_test.ravel(), prob[:, 1]).statistic)
+#                 # testDF = pd.DataFrame()
+#                 # testDF['predict_proba'] = prob[:,1]
+#                 # testDF['label'] = np.array(y_test)
+#                 # print self.cal_ks(testDF)
+#                 # print str(AR) + "-" * 30
+#                 ks = abs(fpr - tpr).max()
+#                 # print str(ks) + "*" * 30
+#                 # print ks
+#                 if ks > self.threshold:
+#                     final_list.append({'varName': col, "AR": ks})
+#                 else:
+#                     self.logger.info('列：' + col + '的AR值为:' + str(ks) + ", 低于阈值：" + str(self.threshold))
+#         # AR值排序
+#         final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True)
+#         self.logger.info(pd.DataFrame(final_list))
+#         pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False)
+#
+#     def cal_ks(self, data):
+#         """
+#         手动计算KS值
+#         :param data:
+#         :return:
+#         """
+#         #  对样本数据排序，根据预测值升序排序
+#         sorted_list = data.sort_values(['predict_proba'], ascending=True)
+#         total_good_count = sorted_list['label'].sum() * 1.0
+#         total_bad_count = (sorted_list.shape[0] - total_good_count) * 1.0
+#         max_ks = 0.0
+#         good_count = 0.0
+#         bad_count = 0.0
+#         for index, row in sorted_list.iterrows():
+#             if row['label'] == 0:
+#                 bad_count += 1.0
+#             else:
+#                 good_count += 1.0
+#             val = abs(bad_count / total_bad_count - good_count / total_good_count)
+#             max_ks = max(max_ks, val)
+#         return max_ks
+#
+#     def cal_ar(self, excel_name='test.xlsx'):
+#         excel = pd.read_excel(excel_name)
+#         if excel.columns.size < 2:
+#             self.logger.error("未找到Excel数据源！")
+#             return
+#         dest_value = excel[self.dest_var]
+#         final_list = []
+#         for col in excel.columns:
+#             if col != self.dest_var:
+#                 AR = float(stats.ks_2samp(excel[col], dest_value).statistic)
+#                 final_list
+#         # self.logger.info(final_list)
+#         # final_list.append({'AR': 1.0, 'colName': u'var3'})
+#         # final_list.append({'AR': 0.8, 'colName': u'var4'})
+#         final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True)
+#         # self.logger.info("final result:" + str(final_list))
+#         # self.logger.info("123")
+#         self.logger.info(pd.DataFrame(final_list))
+#         pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False)
+#
+#     def fill_empty_value(self, col_name, data, default_value=0):
+#         """
+#         缺失值填充
+#         输入：宽表【变量1、变量2、目标变量】，变量名称，缺失值填充值（默认0）
+#         计算方式：直接将指定变量中的缺失值用参数中的填充值进行填充
+#         输出：填充后的宽表，变量缺失率
+#         """
+#         # data = pd.read_excel(file_name)
+#         if col_name not in data.columns.values:
+#             self.logger.error("输入宽表中不存在指定变量")
+#             return
+#         else:
+#             empty_count = data[col_name].shape[0] - data[col_name].count()
+#             if empty_count > 0:
+#                 self.logger.info('当前共' + str(data.shape[0]) + '个变量值，其中缺失值个数为' + str(empty_count))
+#                 # 替换空串为NAN
+#                 data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value)
+#                 self.logger.info('填补后，缺失值个数为' + str(data[col_name].shape[0] - data[col_name].count()))
+#                 # data.to_excel('result.xls', index=False)
+#                 return data
+#             else:
+#                 self.logger.info('当前不存在缺失值')
+#
+#     def del_empty_value(self, data, empty_rate_threshold=0.5):
+#         """
+#         缺失值剔除
+#         输入：宽表【变量1、变量2、目标变量】，缺失率（默认0.5）
+#         计算方式：计算宽表中各个变量的缺失率，并剔除缺失率超过0.5的变量
+#         输出：处理后宽表
+#         """
+#         for col in data.columns.values:
+#             if col == 'y':
+#                 continue
+#             empty_ratio = (data[col].shape[0] - data[col].count()) / data[col].shape[0]
+#             if empty_ratio >= empty_rate_threshold:
+#                 self.logger.info("变量：" + col + "缺失率为" + str(empty_ratio) + ",高于阈值：" + str(empty_rate_threshold))
+#                 data = data.drop(col, axis=1)
+#         return data
+#         # data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False)
+#
+#     def console_input(self, prompt="", if_value=[], else_value=[], if_rtn="", else_rtn=""):
+#         rtn = input(prompt)
+#         if rtn.strip() in if_value:
+#             return if_rtn
+#         elif rtn.strip() in else_value or len(else_value) == 0:
+#             return else_rtn
+#         else:
+#             raise IOError("未匹配到条件")
+#
+#     def file_info(self, path):
+#         """
+#         获取文件信息
+#         :param path: 文件路径
+#         :return: {字段名称：[字段类型，数据量，空值个数]}
+#         """
+#         info_dict = {}
+#         data = pd.read_csv(path)
+#         for c in data.columns:
+#             ctype = data[c].dtype
+#             nc = data[c].size - data[c].notnull().sum()
+#             info_dict[c] = [ctype, data[c].size, nc]  # 字段类型，数据量，空值个数
+#         return info_dict, data
+#
+#     def is_contain_empty_value(self, file_dict):
+#         empty_col_list = []
+#         for item in file_dict:
+#             self.logger.info(file_dict[item])
+#             if int(file_dict[item][2]) > 0:
+#                 self.logger.info("列" + item + "空值个数：" + str(file_dict[item][2]))
+#                 empty_col_list.append(item)
+#         if len(empty_col_list) > 0:
+#             return True, empty_col_list
+#         else:
+#             return False, []
+#
+#     def main(self):
+#         file_path = input("请输入待处理的文件名路径：")
+#         import os.path
+#         if os.path.isfile(file_path):
+#             file_dict, data = self.file_info(file_path)
+#             is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict)
+#             if is_contain_empty_value:
+#                 self.logger.info("当前存在缺失值")
+#                 is_fill_empty = self.console_input(prompt="是否需要填充数据？1：是，其他值：否", if_value=["1"], else_value=[],
+#                                                    if_rtn=True, else_rtn=False)
+#                 if is_fill_empty:
+#                     for col in empty_col_list:
+#                         fill_value = input("请输入列" + col + "待填充的数据：")
+#                         self.logger.info("列" + col + "将填充数据：" + fill_value)
+#                         data = self.fill_empty_value(col_name=col, data=data, default_value=fill_value)
+#                     print(data)
+#                 else:
+#                     self.logger.info("不填充数据，程序退出")
+#             else:
+#                 self.logger.info("当前不存在缺失数据")
+#         else:
+#             self.logger.error("指定的文件路径不存在")
+
+
+def cal_ar(X, y):
+    """
+    计算AR值
+    :param X:
+    :param y:
+    :return:
+    """
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+    lr = LogisticRegression()
+    lr.fit(X_train.values.reshape(-1, 1), y_train)
+    pred = lr.predict_proba(X_test.values.reshape(-1, 1))
+    ar = 2.0 * metrics.roc_auc_score(y_test, pred[:, 1]) - 1.0
+    print('ar值：%s' % str(ar))
+    return ar
+
+# def run():
+#     ar = ARFilter()
+# ar.train_cal_input()
+# ar.fill_empty_value(col_name='emptyCol', file_name='empty.xls', default_value=0)
+# ar.del_empty_value(file_name="empty_ratio.xls")
+# ar.main()
+
+
+# if __name__ == "__main__":
+#     run()
diff --git a/binning.py b/binning.py
index 211dc2a..ccd6cf2 100644
--- a/binning.py
+++ b/binning.py
@@ -15,53 +15,49 @@ def __init__(self, df, target_name, bin_count):
     def equal_distance_binning(self, fea_name):
         """
         等距分箱
-        :param df:
         :param fea_name:
-        :param target_name:
-        :param bin_count:
         :return:
         """
 
         self.df[fea_name + '_d'] = pd.cut(self.df[fea_name], self.bin_count)
         fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby(
             [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0)
+        fea_count.index = fea_count.index.map(lambda x: x.left)
+        fea_count.index.name = fea_name
         return fea_count
 
     def equal_frequency_binning(self, fea_name):
         """
         等频分箱
-        :param df:
         :param fea_name:
-        :param target_name:
-        :param bin_count:
         :return:
         """
         self.df[fea_name + '_f'] = pd.cut(self.df[fea_name], self.bin_count)
         fea_count = self.df[[fea_name + '_f', self.target_name]].copy().groupby(
             [fea_name + '_f', self.target_name]).size().unstack().fillna(0.0)
+        fea_count.index = fea_count.index.map(lambda x: x.left)
+        fea_count.index.name = fea_name
         return fea_count
 
     def auto_binning(self, fea_name):
         """
         自动分箱
-        :param df:
-        :param target_name: 目标变量名
         :param fea_name:特征变量名称
-        :param max_bin_count:最大分箱数
         :return:
         """
         r = 0
         while np.abs(r) < 1:
             d1 = pd.DataFrame({'X': self.df[fea_name],
                                'Y': self.df[self.target_name],
-                               fea_name + '_d': pd.qcut(self.df[fea_name], self.bin_count,
-                                                        duplicates='drop')})
+                               fea_name + '_d': pd.qcut(self.df[fea_name], self.bin_count, duplicates='drop')})
             d2 = d1.groupby(fea_name + '_d', as_index=True)
             r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
-            max_bin_count = max_bin_count - 1
+            self.bin_count = self.bin_count - 1
 
         fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby(
             [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0)
+        fea_count.index = fea_count.index.map(lambda x: x.left)
+        fea_count.index.name = fea_name
         return fea_count
 
     def chi2(self, A):
@@ -87,10 +83,7 @@ def chi2(self, A):
     def chi_merge(self, fea_name):
         """
         chiMerge的主算法
-        :param df:数据，dataframe格式
         :param fea_name:需要进行分段的特征名称
-        :param target_name:目标变量名称
-        :param dis_count:最大分组数
         :return: 分割点
         """
         fea_count = self.df[[fea_name, self.target_name]].copy().groupby(
diff --git a/evaluate.py b/evaluate.py
index 9b96214..4bf7d01 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -32,3 +32,14 @@ def roc(model, test_data):
     plt.ylabel("True Positive Rate")
     plt.title("ROC Diagram")
     plt.show()
+
+
+def correlation_coef(data):
+    """
+    计算相关系数
+    :param data:
+    :return:
+    """
+    correlation = data.corr()
+    print(correlation)
+    return correlation
diff --git a/feature_selection.py b/feature_selection.py
index 11570ad..ad116e0 100644
--- a/feature_selection.py
+++ b/feature_selection.py
@@ -37,6 +37,7 @@ def fea_select(X, y):
     model = SelectFromModel(clf, prefit=True)
     X_new = model.transform(X)
     print(X_new)
+    return X_new
 
 
 def mi(X, y):
diff --git a/main.py b/main.py
index 7ac4aef..26eb76d 100644
--- a/main.py
+++ b/main.py
@@ -7,6 +7,7 @@
 import evaluate
 import modeling
 import woe
+import ARUtil
 import feature_selection
 import math
 from pandas import Interval
@@ -61,14 +62,13 @@ def change_type(df, fea_type_dict):
     df[fea_name] = df[fea_name].astype(target_type)
 
 
-def split_data(data_to_split):
+def split_data(data_to_split, ratio):
     """
     数据分割
     :param data_to_split:带分割数据
+    :param ratio:数据分割比例
     :return: （数据集1，数据集2）
     """
-    # ratio = float(input('请输入数据分割比例：'))
-    ratio = 0.8
     data_count = data_to_split.shape[0]
     selected_count = int(data_count * ratio)
     if selected_count > 0:
@@ -107,8 +107,9 @@ def split_data(data_to_split):
         woe.add_woe_col(data, bins)
     print(data)
 
-    # select_func = chi2_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1)
+    # select_func = feature_selection.fea_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1)
     # print(select_func.transform(data[['SepalLength', 'SepalWidth']]))
 
-    feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label'])
-    feature_selection.mi(data['SepalWidth_woe'], data['Label'])
+    # feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label'])
+    # feature_selection.mi(data['SepalWidth_woe'], data['Label'])
+    ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label'])
diff --git a/woe.py b/woe.py
index 5e34ba9..c49b771 100644
--- a/woe.py
+++ b/woe.py
@@ -213,7 +213,12 @@ def add_woe_col(data, bins):
     bin_woe = dict(zip(interval_list, woe_list))
     data[fea_name + '_bin'] = pd.cut(data[fea_name], bins=np.append(bins.index.values, [np.inf])).astype(str)
     data[fea_name + '_woe'] = data[fea_name + '_bin'].apply(lambda x: bin_woe[x])
-    del data[fea_name + '_bin']
+    if fea_name + '_bin' in data.columns.values:
+        del data[fea_name + '_bin']
+    if fea_name + '_d' in data.columns.values:
+        del data[fea_name + '_d']
+    if fea_name + '_f' in data.columns.values:
+        del data[fea_name + '_f']
 
 # if __name__ == '__main__':
 #     path=input('Please input the file path: ')

From 6381dfb933007c59e9a37ba980dc4da348dbf68e Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Tue, 3 Jul 2018 11:06:49 +0800
Subject: [PATCH 44/49] add mutual information

---
 main.py | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/main.py b/main.py
index 26eb76d..23c2d10 100644
--- a/main.py
+++ b/main.py
@@ -12,7 +12,7 @@
 import math
 from pandas import Interval
 from numpy import inf
-
+from pprint import pprint
 
 def file_info(file_path):
     """
@@ -82,34 +82,32 @@ def split_data(data_to_split, ratio):
     # path=input('Please input the file path: ')
     path = 'iris.csv'
     fea_dict, data = file_info(path)
+    pprint(fea_dict)
     data = data.fillna(0.0)
     # change_type(data, fea_dict)
     # print(data.dtypes)
 
-    # print(t[0].shape)
-    # print(t[1].shape)
-    # binning.auto_binning(data, 'SepalLength','Label', 10)
-    # binning.auto_binning(data, 'PetalLength','Label', 10)
-    # binning.auto_binning(data, 'PetalWidth','Label',  10)
-    # train_data, test_data = split_data(data)
-    # model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
-    # predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model,
-    #                                      0.5, 100, 10)
-    # print(list(zip(test_data['Label'].values, predict_score)))
-    #
-    # auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
-    # print("au值: " + str(auc))
-    # evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
-
     bin = binning.Bin(data, 'Label', 5)
     for n in data.columns.values[:-1]:
         bins = bin.chi_merge(n)
         woe.add_woe_col(data, bins)
-    print(data)
+
+    # 单变量ar值计算
+    # ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label'])
+
+    train_data, test_data = split_data(data,0.7)
+    model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
+    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10)
+    pprint(list(zip(test_data['Label'].values, predict_score)))
+    auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
+    print("au值: " + str(auc))
+    evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
+
+
 
     # select_func = feature_selection.fea_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1)
     # print(select_func.transform(data[['SepalLength', 'SepalWidth']]))
 
     # feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label'])
     # feature_selection.mi(data['SepalWidth_woe'], data['Label'])
-    ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label'])
+

From 581948598aeba4da4ef7726e381a11e905ac0c4e Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Tue, 3 Jul 2018 11:17:11 +0800
Subject: [PATCH 45/49] add mutual information

---
 main.py | 45 +++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/main.py b/main.py
index 23c2d10..a60046d 100644
--- a/main.py
+++ b/main.py
@@ -14,6 +14,7 @@
 from numpy import inf
 from pprint import pprint
 
+
 def file_info(file_path):
     """
     获取文件信息
@@ -46,20 +47,26 @@ def change_type(df, fea_type_dict):
     print('字段名称对应数字为：')
     for (n, m) in feature_dict.items():
         print(n, m)
-    fea_name = int(input('请输入如需要更改数据类型的字段对应的数字：'))
-    if fea_name not in feature_dict.keys():
-        fea_name = int(input('输入字段名称错误，请重新输入：'))
-        if fea_name not in fea_dict.keys():
-            pass
-    fea_name = feature_dict[fea_name]
-
-    target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：'))
-    if target_type not in type_dict.keys():
+    if_change = input('是否需要修改字段类型？(y/n)')
+    if if_change == 'y':
+        fea_name = int(input('请输入需要更改数据类型的字段对应的数字：'))
+        if fea_name not in feature_dict.keys():
+            fea_name = int(input('输入字段名称错误，请重新输入：'))
+            if fea_name not in fea_dict.keys():
+                pass
+        fea_name = feature_dict[fea_name]
+
         target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：'))
         if target_type not in type_dict.keys():
-            pass
-    target_type = type_dict[target_type]
-    df[fea_name] = df[fea_name].astype(target_type)
+            target_type = int(input('请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：'))
+            if target_type not in type_dict.keys():
+                pass
+        target_type = type_dict[target_type]
+        df[fea_name] = df[fea_name].astype(target_type)
+    elif if_change == 'n':
+        pass
+    else:
+        pass
 
 
 def split_data(data_to_split, ratio):
@@ -82,10 +89,12 @@ def split_data(data_to_split, ratio):
     # path=input('Please input the file path: ')
     path = 'iris.csv'
     fea_dict, data = file_info(path)
+    print('字段名', '数据类型', '数据总量', '缺失值个数')
     pprint(fea_dict)
     data = data.fillna(0.0)
-    # change_type(data, fea_dict)
-    # print(data.dtypes)
+
+    change_type(data, fea_dict)
+    print(data.dtypes)
 
     bin = binning.Bin(data, 'Label', 5)
     for n in data.columns.values[:-1]:
@@ -95,12 +104,13 @@ def split_data(data_to_split, ratio):
     # 单变量ar值计算
     # ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label'])
 
-    train_data, test_data = split_data(data,0.7)
+    train_data, test_data = split_data(data, 0.7)
     model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
-    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 0.5, 100, 10)
+    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model,
+                                         0.5, 100, 10)
     pprint(list(zip(test_data['Label'].values, predict_score)))
     auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
-    print("au值: " + str(auc))
+    print("auc值: " + str(auc))
     evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
 
 
@@ -110,4 +120,3 @@ def split_data(data_to_split, ratio):
 
     # feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label'])
     # feature_selection.mi(data['SepalWidth_woe'], data['Label'])
-

From a6caf67f975be1348b568d7c43786c14de144e05 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Wed, 4 Jul 2018 13:27:20 +0800
Subject: [PATCH 46/49] add mutual information

---
 evaluate.py                   | 14 +++-----------
 ARUtil.py => feature_index.py | 10 ++++++++++
 feature_selection.py          |  7 ++++---
 main.py                       |  2 +-
 4 files changed, 18 insertions(+), 15 deletions(-)
 rename ARUtil.py => feature_index.py (98%)

diff --git a/evaluate.py b/evaluate.py
index 4bf7d01..e719ea1 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -6,7 +6,7 @@
 
 def auc(model, test_data):
     """
-
+    AUC
     :param model:模型
     :param test_data:测试数据，dataframe格式，第一列至倒数第二列为特征字段，最后一列为目标字段
     :return:auc值
@@ -17,7 +17,7 @@ def auc(model, test_data):
 
 def roc(model, test_data):
     """
-
+    ROC
     :param model:模型
     :param test_data:测试数据，dataframe格式，第一列至倒数第二列为特征字段，最后一列为目标字段
     :return:roc曲线
@@ -34,12 +34,4 @@ def roc(model, test_data):
     plt.show()
 
 
-def correlation_coef(data):
-    """
-    计算相关系数
-    :param data:
-    :return:
-    """
-    correlation = data.corr()
-    print(correlation)
-    return correlation
+
diff --git a/ARUtil.py b/feature_index.py
similarity index 98%
rename from ARUtil.py
rename to feature_index.py
index 0281d6a..0868e61 100755
--- a/ARUtil.py
+++ b/feature_index.py
@@ -235,6 +235,16 @@ def cal_ar(X, y):
     print('ar值：%s' % str(ar))
     return ar
 
+def correlation_coef(data):
+    """
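+    Compute the correlation coefficients by calling ``data.corr()``; print and return the result.
+    :param data: object exposing a ``corr()`` method (presumably a pandas DataFrame; confirm with callers)
+    :return: the value returned by ``data.corr()``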
+    """
+    correlation = data.corr()
+    print(correlation)
+    return correlation
+
 # def run():
 #     ar = ARFilter()
 # ar.train_cal_input()
diff --git a/feature_selection.py b/feature_selection.py
index ad116e0..6348262 100644
--- a/feature_selection.py
+++ b/feature_selection.py
@@ -3,13 +3,14 @@
 
 from sklearn.feature_selection import SelectKBest
 from sklearn.feature_selection import chi2
-from sklearn.feature_selection import RFE
-from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
 from sklearn.tree import DecisionTreeClassifier
-from sklearn.linear_model import LogisticRegression
 from sklearn.feature_selection import SelectFromModel
 from minepy import MINE
 
+from sklearn.feature_selection import RFE
+from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+
 
 def chi2_select(X, y, number):
     """
diff --git a/main.py b/main.py
index a60046d..123e722 100644
--- a/main.py
+++ b/main.py
@@ -7,7 +7,7 @@
 import evaluate
 import modeling
 import woe
-import ARUtil
+import feature_index
 import feature_selection
 import math
 from pandas import Interval

From 88c6e71d29b8c145872f32784604885b3a963caf Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Wed, 4 Jul 2018 14:15:46 +0800
Subject: [PATCH 47/49] add mutual information

---
 main.py     | 3 +--
 modeling.py | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/main.py b/main.py
index 123e722..b9b4354 100644
--- a/main.py
+++ b/main.py
@@ -106,8 +106,7 @@ def split_data(data_to_split, ratio):
 
     train_data, test_data = split_data(data, 0.7)
     model = modeling.model(train_data, ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe'], 'Label')
-    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model,
-                                         0.5, 100, 10)
+    predict_score = modeling.score_trans(test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']], model, 300, 25)
     pprint(list(zip(test_data['Label'].values, predict_score)))
     auc = evaluate.auc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
     print("auc值: " + str(auc))
diff --git a/modeling.py b/modeling.py
index 59434e4..ed4729f 100644
--- a/modeling.py
+++ b/modeling.py
@@ -11,9 +11,9 @@ def model(data, fea_list, target):
     return cls
 
 
-def score_trans(data, model, p, scaled_value, pdo):
-    b = pdo / np.log(2)
-    a = scaled_value + b * np.log(p)
+def score_trans(data, model, scaled_value, pdo):
+    b = -pdo / np.log(2)
+    a = scaled_value
     p = model.predict_proba(data)[:, 1]
     score = a - np.log(p / (1 - p)) * b
 

From 1eff0e1e15c1f76d12e04d53f3e73418d634f316 Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 5 Jul 2018 08:56:03 +0800
Subject: [PATCH 48/49] add mutual information

---
 main.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/main.py b/main.py
index b9b4354..76d913a 100644
--- a/main.py
+++ b/main.py
@@ -112,8 +112,6 @@ def split_data(data_to_split, ratio):
     print("auc值: " + str(auc))
     evaluate.roc(model, test_data[['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe', 'Label']])
 
-
-
     # select_func = feature_selection.fea_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1)
     # print(select_func.transform(data[['SepalLength', 'SepalWidth']]))
 

From 8cab05d08e461a066ffc08245f223d14a44ed7ad Mon Sep 17 00:00:00 2001
From: Lansingcode <1406063770@qq.com>
Date: Thu, 5 Jul 2018 11:21:26 +0800
Subject: [PATCH 49/49] add mutual information

---
 binning.py | 4 ++++
 woe.py     | 3 +--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/binning.py b/binning.py
index ccd6cf2..ce2a17d 100644
--- a/binning.py
+++ b/binning.py
@@ -104,6 +104,10 @@ def chi_merge(self, fea_name):
                 fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea]
                 fea_count.drop([next_fea], inplace=True)
                 chi_list.remove(chi_list[chi_min_index + 1])
+        fea_count.index = np.append([-np.inf], fea_count.index.values[1:])
+        fea_count['bin'] = pd.cut(np.append(fea_count.index.values, [np.inf]),
+                                  bins=np.append(fea_count.index.values, [np.inf]))[1:].astype(str)
+        fea_count.index.name = fea_name
         return fea_count
 
 #
diff --git a/woe.py b/woe.py
index c49b771..c3bb464 100644
--- a/woe.py
+++ b/woe.py
@@ -189,7 +189,6 @@ def add_woe_col(data, bins):
     """
     fea_name = bins.index.name
     bin_index = bins.index.values.astype(float)
-    bin_index[0] = -np.inf
     bins.index = bin_index
     bins.index.name = fea_name
     bin_index = np.append(bin_index, np.inf)
@@ -201,7 +200,7 @@ def add_woe_col(data, bins):
         if bin_index[i] == bin_index[i + 1]:
             continue
         else:
-            interval_list.append('(' + str(bin_index[i]) + ', ' + str(bin_index[i + 1]) + ']')
+            interval_list.append(bins['bin'][bin_index[i]])
             rate_event = bins[0.0][bin_index[i]] / bins[0.0].sum()
             rate_non_event = bins[1.0][bin_index[i]] / bins[1.0].sum()
             if rate_event == 0.0: