diff --git a/binning.py b/binning.py
new file mode 100644
index 0000000..ce2a17d
--- /dev/null
+++ b/binning.py
@@ -0,0 +1,124 @@
+# -*- coding:utf-8 -*-
+__author__ = 'xujia'
+
+import pandas as pd
+import numpy as np
+from scipy import stats
+
+
+class Bin:
+    def __init__(self, df, target_name, bin_count):
+        self.df = df  # working frame; the binning methods add '<feature>_d' / '<feature>_f' columns to it
+        self.target_name = target_name  # name of the target (label) column in df
+        self.bin_count = bin_count  # requested number of bins (an upper bound for auto_binning / chi_merge)
+
+    def equal_distance_binning(self, fea_name):
+        """
+        Equal-width binning: cut the feature into bin_count intervals and count rows per target value.
+        :param fea_name: feature column to bin; the intervals are stored in self.df['<fea_name>_d']
+        :return: DataFrame indexed by each bin's left edge, one count column per target value
+        """
+
+        self.df[fea_name + '_d'] = pd.cut(self.df[fea_name], self.bin_count)
+        fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby(
+            [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0)
+        fea_count.index = fea_count.index.map(lambda x: x.left)
+        fea_count.index.name = fea_name
+        return fea_count
+
+    def equal_frequency_binning(self, fea_name):
+        """
+        Equal-frequency binning: cut the feature at its quantiles (duplicate edges are dropped) and count rows.
+        :param fea_name: feature column to bin; the intervals are stored in self.df['<fea_name>_f']
+        :return: DataFrame indexed by each bin's left edge, one count column per target value
+        """
+        self.df[fea_name + '_f'] = pd.qcut(self.df[fea_name], self.bin_count, duplicates='drop')
+        fea_count = self.df[[fea_name + '_f', self.target_name]].copy().groupby(
+            [fea_name + '_f', self.target_name]).size().unstack().fillna(0.0)
+        fea_count.index = fea_count.index.map(lambda x: x.left)
+        fea_count.index.name = fea_name
+        return fea_count
+
+    def auto_binning(self, fea_name):
+        """
+        Quantile binning with the most bins (at most bin_count) whose per-bin feature/target means are monotonic.
+        :param fea_name: feature column to bin; the accepted intervals are stored in self.df['<fea_name>_d']
+        :return: DataFrame indexed by each bin's left edge, one count column per target value
+        """
+        q = self.bin_count  # shrink a local copy so the configured bin count is not consumed
+        while True:
+            cuts = pd.qcut(self.df[fea_name], q, duplicates='drop')
+            means = pd.DataFrame({'X': self.df[fea_name], 'Y': self.df[self.target_name]}).groupby(cuts).mean()
+            r, _ = stats.spearmanr(means.X, means.Y)
+            if q <= 2 or not np.abs(r) < 1:  # monotonic (|r| == 1), undefined (nan) or nothing left to merge
+                break
+            q -= 1
+        self.df[fea_name + '_d'] = cuts  # persist the accepted bins; the grouping below reads this column
+        fea_count = self.df[[fea_name + '_d', self.target_name]].copy().groupby(
+            [fea_name + '_d', self.target_name]).size().unstack().fillna(0.0)
+        fea_count.index = fea_count.index.map(lambda x: x.left)
+        fea_count.index.name = fea_name
+        return fea_count
+
+    def chi2(self, A):
+        """
+        Compute the chi-square statistic of a contingency table.
+        :param A: 2-D array of counts (chi_merge passes two adjacent rows)
+        :return: chi-square value
+        """
+        m, k = A.shape  # number of rows, number of columns
+
+        R = A.sum(axis=1)  # row totals
+        C = A.sum(axis=0)  # column totals
+        N = A.sum()  # grand total
+
+        res = 0
+        for i in range(m):
+            for j in range(k):
+                Eij = 1.0 * R[i] * C[j] / N  # expected count of cell (i, j)
+                if Eij != 0:  # cells with a zero expectation contribute nothing
+                    res = 1.0 * res + (A[i][j] - Eij) ** 2 / Eij
+        return res
+
+    def chi_merge(self, fea_name):
+        """
+        ChiMerge: start with one bin per distinct value and merge the adjacent pair with the smallest chi-square.
+        :param fea_name: name of the feature to discretise; merging stops once at most bin_count bins remain
+        :return: per-bin target counts indexed by bin start (first one -inf) plus a 'bin' column of interval labels
+        """
+        fea_count = self.df[[fea_name, self.target_name]].copy().groupby(
+            [fea_name, self.target_name]).size().unstack().fillna(0.0)
+        while fea_count.shape[0] > self.bin_count:
+            chi_list = []
+            for i in range(fea_count.shape[0] - 1):
+                chi_value = self.chi2(fea_count.iloc[i:i + 2].values)
+                chi_list.append([fea_count.index[i], chi_value])
+            chi_min_index = np.argmin(np.array(chi_list)[:, 1])
+            current_fea = chi_list[chi_min_index][0]
+            if chi_min_index == len(chi_list) - 1:
+                # the cheapest pair is the last two rows: fold the tail into current_fea and truncate
+                fea_count.loc[current_fea] = fea_count.loc[current_fea:].sum(axis=0)
+                fea_count = fea_count.loc[:current_fea].copy()
+            else:
+                next_fea = chi_list[chi_min_index + 1][0]
+                fea_count.loc[current_fea] = fea_count.loc[current_fea] + fea_count.loc[next_fea]
+                fea_count.drop([next_fea], inplace=True)
+                # chi_list is rebuilt from scratch on the next pass, so it needs no patching here
+        fea_count.index = np.append([-np.inf], fea_count.index.values[1:])
+        fea_count['bin'] = pd.cut(np.append(fea_count.index.values, [np.inf]),
+                                  bins=np.append(fea_count.index.values, [np.inf]))[1:].astype(str)
+        fea_count.index.name = fea_name
+        return fea_count
+
+#
+# def discrete(path):
+#     df = pd.read_csv(path)
+#     target_name = df.columns[-1]
+#     fea_names = df.columns[0:-1]
+#     dis_count = 2
+#     for f in fea_names:
+#         chi_merge(df, f, target_name, dis_count)
+#
+#
+# if __name__ == '__main__':
+#     discrete('iris.csv')
diff --git a/bins.py b/bins.py
deleted file mode 100644
index 44d37d3..0000000
--- a/bins.py
+++ /dev/null
@@ -1 +0,0 @@
-# -*- coding:utf-8 -*-
\ No newline at end of file
diff --git a/evaluate.py b/evaluate.py
new file mode 100644
index 0000000..e719ea1
--- /dev/null
+++ b/evaluate.py
@@ -0,0 +1,37 @@
+# -*-coding:utf-8 -*-
+
+from sklearn import metrics
+import matplotlib.pyplot as plt
+
+
+def auc(model, test_data):
+    """
+    Compute the ROC AUC of a fitted model on a test frame.
+    :param model: fitted classifier exposing predict_proba
+    :param test_data: DataFrame; every column but the last is a feature, the last column is the target
+    :return: AUC value
+    """
+    predict_value = model.predict_proba(test_data.iloc[:, 0:-1])[:, 1]  # .iloc: DataFrame.ix is gone since pandas 1.0
+    return metrics.roc_auc_score(test_data.iloc[:, -1], predict_value)
+
+
+def roc(model, test_data):
+    """
+    Plot the ROC curve of a fitted model on a test frame (the AUC is shown in the legend).
+    :param model: fitted classifier exposing predict_proba
+    :param test_data: DataFrame; every column but the last is a feature, the last column is the target
+    :return: None; the figure is displayed with plt.show()
+    """
+    predict_value = model.predict_proba(test_data.iloc[:, 0:-1])[:, 1]  # .iloc: DataFrame.ix is gone since pandas 1.0
+    fpr, tpr, thresholds = metrics.roc_curve(test_data.iloc[:, -1], predict_value)
+    roc_auc = metrics.auc(fpr, tpr)
+    plt.figure()
+    plt.plot(fpr, tpr, label='data1, AUC = %0.2f' % roc_auc)
+    plt.legend(loc=4)
+    plt.xlabel("False Positive Rate")
+    plt.ylabel("True Positive Rate")
+    plt.title("ROC Diagram")
+    plt.show()
+
+
+
diff --git a/feature_index.py b/feature_index.py
new file mode 100755
index 0000000..0868e61
--- /dev/null
+++ b/feature_index.py
@@ -0,0 +1,257 @@
+# -*- coding:utf-8 -*-
+
+from sklearn import metrics
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+
+# class ARFilter(object):
+#     def __init__(self, threshold=0.05, dest_var='y'):
+#         self.threshold = threshold
+#         self.dest_var = dest_var
+#         logging.basicConfig()
+#         self.logger = logging.getLogger("default")
+#         self.logger.setLevel(level=logging.INFO)
+#
+#     def info_value(self):
+#         """
+#         信息熵
+#         :return:
+#         """
+#         pass
+#
+#     def chi_square(self):
+#         """
+#         卡方
+#         :return:
+#         """
+#         pass
+#
+#     def train_cal_input(self, excel_name='input.csv'):
+#         """
+#         AR值筛选
+#         输入：宽表【变量1、变量2、目标变量】、筛选下限（默认0.05）、目标变量名称（默认y）
+#         输出：筛选后的变量列表【变量名称,AR值】（按照AR值降序排列）
+#         计算方式：使用单个变量与目标变量进行逻辑回归运算，返回模型的K-S值即为该变量的AR值。
+#         """
+#         from sklearn.linear_model import LogisticRegression
+#         from sklearn.metrics import roc_curve
+#         data = pd.read_csv(excel_name)
+#         # 创建逻辑回归模型
+#         logit_model = LogisticRegression()
+#         final_list = []
+#         for col in data.columns.values[0:-1]:
+#             if col != self.dest_var:
+#                 # 特征变量值
+#                 X = data[col].values.reshape(-1, 1)
+#                 # 拆分数据集为训练集与测试集
+#                 x_train = X[:-20]
+#                 x_test = X[-20:]
+#                 # 目标变量值
+#                 y = data[self.dest_var].values.reshape(-1, 1)
+#                 y_train = y[:-20]
+#                 y_test = y[-20:]
+#                 # 数据拟合
+#                 logit_model.fit(x_train, y_train)
+#                 # 每一列与y列做预测
+#                 # prob = logit_model.predict_proba(data[col].values.reshape(-1, 1))
+#                 prob = logit_model.predict_proba(x_test)
+#                 # prob[:, 1] 预测结果为两列，分别为0值可能性与1值可能性，此处取1值可能性
+#                 # fpr, tpr, thresholds = roc_curve(data[self.dest_var].values.reshape(-1, 1), prob[:, 1])
+#                 fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1])
+#                 from scipy import stats
+#                 # AR = float(stats.ks_2samp(y_test, prob[:, 1].reshape(-1, 1)).statistic)
+#                 # AR = float(stats.ks_2samp(y_test.ravel(), prob[:, 1]).statistic)
+#                 # testDF = pd.DataFrame()
+#                 # testDF['predict_proba'] = prob[:,1]
+#                 # testDF['label'] = np.array(y_test)
+#                 # print self.cal_ks(testDF)
+#                 # print str(AR) + "-" * 30
+#                 ks = abs(fpr - tpr).max()
+#                 # print str(ks) + "*" * 30
+#                 # print ks
+#                 if ks > self.threshold:
+#                     final_list.append({'varName': col, "AR": ks})
+#                 else:
+#                     self.logger.info('列：' + col + '的AR值为:' + str(ks) + ", 低于阈值：" + str(self.threshold))
+#         # AR值排序
+#         final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True)
+#         self.logger.info(pd.DataFrame(final_list))
+#         pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False)
+#
+#     def cal_ks(self, data):
+#         """
+#         手动计算KS值
+#         :param data:
+#         :return:
+#         """
+#         #  对样本数据排序，根据预测值升序排序
+#         sorted_list = data.sort_values(['predict_proba'], ascending=True)
+#         total_good_count = sorted_list['label'].sum() * 1.0
+#         total_bad_count = (sorted_list.shape[0] - total_good_count) * 1.0
+#         max_ks = 0.0
+#         good_count = 0.0
+#         bad_count = 0.0
+#         for index, row in sorted_list.iterrows():
+#             if row['label'] == 0:
+#                 bad_count += 1.0
+#             else:
+#                 good_count += 1.0
+#             val = abs(bad_count / total_bad_count - good_count / total_good_count)
+#             max_ks = max(max_ks, val)
+#         return max_ks
+#
+#     def cal_ar(self, excel_name='test.xlsx'):
+#         excel = pd.read_excel(excel_name)
+#         if excel.columns.size < 2:
+#             self.logger.error("未找到Excel数据源！")
+#             return
+#         dest_value = excel[self.dest_var]
+#         final_list = []
+#         for col in excel.columns:
+#             if col != self.dest_var:
+#                 AR = float(stats.ks_2samp(excel[col], dest_value).statistic)
+#                 final_list
+#         # self.logger.info(final_list)
+#         # final_list.append({'AR': 1.0, 'colName': u'var3'})
+#         # final_list.append({'AR': 0.8, 'colName': u'var4'})
+#         final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True)
+#         # self.logger.info("final result:" + str(final_list))
+#         # self.logger.info("123")
+#         self.logger.info(pd.DataFrame(final_list))
+#         pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False)
+#
+#     def fill_empty_value(self, col_name, data, default_value=0):
+#         """
+#         缺失值填充
+#         输入：宽表【变量1、变量2、目标变量】，变量名称，缺失值填充值（默认0）
+#         计算方式：直接将指定变量中的缺失值用参数中的填充值进行填充
+#         输出：填充后的宽表，变量缺失率
+#         """
+#         # data = pd.read_excel(file_name)
+#         if col_name not in data.columns.values:
+#             self.logger.error("输入宽表中不存在指定变量")
+#             return
+#         else:
+#             empty_count = data[col_name].shape[0] - data[col_name].count()
+#             if empty_count > 0:
+#                 self.logger.info('当前共' + str(data.shape[0]) + '个变量值，其中缺失值个数为' + str(empty_count))
+#                 # 替换空串为NAN
+#                 data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value)
+#                 self.logger.info('填补后，缺失值个数为' + str(data[col_name].shape[0] - data[col_name].count()))
+#                 # data.to_excel('result.xls', index=False)
+#                 return data
+#             else:
+#                 self.logger.info('当前不存在缺失值')
+#
+#     def del_empty_value(self, data, empty_rate_threshold=0.5):
+#         """
+#         缺失值剔除
+#         输入：宽表【变量1、变量2、目标变量】，缺失率（默认0.5）
+#         计算方式：计算宽表中各个变量的缺失率，并剔除缺失率超过0.5的变量
+#         输出：处理后宽表
+#         """
+#         for col in data.columns.values:
+#             if col == 'y':
+#                 continue
+#             empty_ratio = (data[col].shape[0] - data[col].count()) / data[col].shape[0]
+#             if empty_ratio >= empty_rate_threshold:
+#                 self.logger.info("变量：" + col + "缺失率为" + str(empty_ratio) + ",高于阈值：" + str(empty_rate_threshold))
+#                 data = data.drop(col, axis=1)
+#         return data
+#         # data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False)
+#
+#     def console_input(self, prompt="", if_value=[], else_value=[], if_rtn="", else_rtn=""):
+#         rtn = input(prompt)
+#         if rtn.strip() in if_value:
+#             return if_rtn
+#         elif rtn.strip() in else_value or len(else_value) == 0:
+#             return else_rtn
+#         else:
+#             raise IOError("未匹配到条件")
+#
+#     def file_info(self, path):
+#         """
+#         获取文件信息
+#         :param path: 文件路径
+#         :return: {字段名称：[字段类型，数据量，空值个数]}
+#         """
+#         info_dict = {}
+#         data = pd.read_csv(path)
+#         for c in data.columns:
+#             ctype = data[c].dtype
+#             nc = data[c].size - data[c].notnull().sum()
+#             info_dict[c] = [ctype, data[c].size, nc]  # 字段类型，数据量，空值个数
+#         return info_dict, data
+#
+#     def is_contain_empty_value(self, file_dict):
+#         empty_col_list = []
+#         for item in file_dict:
+#             self.logger.info(file_dict[item])
+#             if int(file_dict[item][2]) > 0:
+#                 self.logger.info("列" + item + "空值个数：" + str(file_dict[item][2]))
+#                 empty_col_list.append(item)
+#         if len(empty_col_list) > 0:
+#             return True, empty_col_list
+#         else:
+#             return False, []
+#
+#     def main(self):
+#         file_path = input("请输入待处理的文件名路径：")
+#         import os.path
+#         if os.path.isfile(file_path):
+#             file_dict, data = self.file_info(file_path)
+#             is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict)
+#             if is_contain_empty_value:
+#                 self.logger.info("当前存在缺失值")
+#                 is_fill_empty = self.console_input(prompt="是否需要填充数据？1：是，其他值：否", if_value=["1"], else_value=[],
+#                                                    if_rtn=True, else_rtn=False)
+#                 if is_fill_empty:
+#                     for col in empty_col_list:
+#                         fill_value = input("请输入列" + col + "待填充的数据：")
+#                         self.logger.info("列" + col + "将填充数据：" + fill_value)
+#                         data = self.fill_empty_value(col_name=col, data=data, default_value=fill_value)
+#                     print(data)
+#                 else:
+#                     self.logger.info("不填充数据，程序退出")
+#             else:
+#                 self.logger.info("当前不存在缺失数据")
+#         else:
+#             self.logger.error("指定的文件路径不存在")
+
+
+def cal_ar(X, y):
+    """
+    Fit a one-feature logistic regression and report its accuracy ratio, AR = 2 * AUC - 1.
+    :param X: feature values exposing .values (e.g. a pandas Series); reshaped to a single column
+    :param y: target labels
+    :return: AR measured on the 30% hold-out part of a fixed split (random_state=42); also printed
+    """
+    splits = train_test_split(X, y, test_size=0.3, random_state=42)
+    train_x, test_x, train_y, test_y = splits
+    classifier = LogisticRegression().fit(train_x.values.reshape(-1, 1), train_y)
+    positive_proba = classifier.predict_proba(test_x.values.reshape(-1, 1))[:, 1]
+    ar = 2.0 * metrics.roc_auc_score(test_y, positive_proba) - 1.0
+    print('ar值：%s' % str(ar))
+    return ar
+
+def correlation_coef(data):
+    """
+    Print and return the pairwise correlation matrix of a frame.
+    :param data: object exposing .corr() (e.g. a pandas DataFrame)
+    :return: the result of data.corr()
+    """
+    corr_matrix = data.corr()
+    print(corr_matrix)
+    return corr_matrix
+
+# def run():
+#     ar = ARFilter()
+# ar.train_cal_input()
+# ar.fill_empty_value(col_name='emptyCol', file_name='empty.xls', default_value=0)
+# ar.del_empty_value(file_name="empty_ratio.xls")
+# ar.main()
+
+
+# if __name__ == "__main__":
+#     run()
diff --git a/feature_selection.py b/feature_selection.py
new file mode 100644
index 0000000..6348262
--- /dev/null
+++ b/feature_selection.py
@@ -0,0 +1,64 @@
+# -*- coding:utf-8 -*-
+__author__ = 'xujia'
+
+from sklearn.feature_selection import SelectKBest
+from sklearn.feature_selection import chi2
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.feature_selection import SelectFromModel
+from minepy import MINE
+
+from sklearn.feature_selection import RFE
+from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+
+
+def chi2_select(X, y, number):
+    """
+    Fit a chi-square SelectKBest on the data and print its per-feature scores.
+    :param X: feature matrix
+    :param y: target labels
+    :param number: passed to SelectKBest as k
+    :return: the fitted SelectKBest object (not a transformed X)
+    """
+    selector = SelectKBest(chi2, k=number).fit(X, y)
+    print(selector.scores_)
+    return selector
+
+
+def fea_select(X, y):
+    """
+    Select features with a freshly fitted decision tree.
+    :param X: feature matrix
+    :param y: target labels
+    :return: X transformed by SelectFromModel (no explicit threshold is passed)
+    """
+    # fit() returns the estimator itself, so the fitted tree can be bound in one step
+    tree = DecisionTreeClassifier().fit(X, y)
+    print(tree.feature_importances_)
+    # prefit=True: reuse the tree fitted above instead of refitting inside the selector
+    reduced = SelectFromModel(tree, prefit=True).transform(X)
+    print(reduced)
+    return reduced
+
+
+def mi(X, y):
+    """
+    Compute the MIC score (minepy MINE.mic()) between each feature and the target.
+    :param X: DataFrame (one score per column) or a single named Series
+    :param y: target values
+    :return: dict {feature name: MIC}; the dict is also printed
+    """
+    m = MINE()
+    # Dispatch on dimensionality explicitly. The old code probed X.shape[1] inside a bare
+    # `except:`, which hid real errors and returned None for a one-column DataFrame.
+    if getattr(X, 'ndim', 1) > 1:
+        features = [(f, X[f]) for f in X.columns]
+    else:
+        features = [(X.name, X)]
+
+    mi_dict = {}
+    for name, values in features:
+        m.compute_score(values, y)
+        mi_dict[name] = m.mic()
+    print(mi_dict)
+    return mi_dict
diff --git a/iris.csv b/iris.csv
new file mode 100644
index 0000000..517507e
--- /dev/null
+++ b/iris.csv
@@ -0,0 +1,101 @@
+SepalLength,SepalWidth,PetalLength,PetalWidth,Label
+5.1,3.5,1.4,0.2,0.0
+4.9,3.0,1.4,0.2,0.0
+4.7,3.2,1.3,0.2,0.0
+4.6,3.1,1.5,0.2,0.0
+5.0,3.6,1.4,0.2,0.0
+5.4,3.9,1.7,0.4,0.0
+4.6,3.4,1.4,0.3,0.0
+5.0,3.4,1.5,0.2,0.0
+4.4,2.9,1.4,0.2,0.0
+4.9,3.1,1.5,0.1,0.0
+5.4,3.7,1.5,0.2,0.0
+4.8,3.4,1.6,0.2,0.0
+4.8,3.0,1.4,0.1,0.0
+4.3,3.0,1.1,0.1,0.0
+5.8,4.0,1.2,0.2,0.0
+5.7,4.4,1.5,0.4,0.0
+5.4,3.9,1.3,0.4,0.0
+5.1,3.5,1.4,0.3,0.0
+5.7,3.8,1.7,0.3,0.0
+5.1,3.8,1.5,0.3,0.0
+5.4,3.4,1.7,0.2,0.0
+5.1,3.7,1.5,0.4,0.0
+4.6,3.6,1.0,0.2,0.0
+5.1,3.3,1.7,0.5,0.0
+4.8,3.4,1.9,0.2,0.0
+5.0,3.0,1.6,0.2,0.0
+5.0,3.4,1.6,0.4,0.0
+5.2,3.5,1.5,0.2,0.0
+5.2,3.4,1.4,0.2,0.0
+4.7,3.2,1.6,0.2,0.0
+4.8,3.1,1.6,0.2,0.0
+5.4,3.4,1.5,0.4,0.0
+5.2,4.1,1.5,0.1,0.0
+5.5,4.2,1.4,0.2,0.0
+4.9,3.1,1.5,0.1,0.0
+5.0,3.2,1.2,0.2,0.0
+5.5,3.5,1.3,0.2,0.0
+4.9,3.1,1.5,0.1,0.0
+4.4,3.0,1.3,0.2,0.0
+5.1,3.4,1.5,0.2,0.0
+5.0,3.5,1.3,0.3,0.0
+4.5,2.3,1.3,0.3,0.0
+4.4,3.2,1.3,0.2,0.0
+5.0,3.5,1.6,0.6,0.0
+5.1,3.8,1.9,0.4,0.0
+4.8,3.0,1.4,0.3,0.0
+5.1,3.8,1.6,0.2,0.0
+4.6,3.2,1.4,0.2,0.0
+5.3,3.7,1.5,0.2,0.0
+5.0,3.3,1.4,0.2,0.0
+7.0,3.2,4.7,1.4,1.0
+6.4,3.2,4.5,1.5,1.0
+6.9,3.1,4.9,1.5,1.0
+5.5,2.3,4.0,1.3,1.0
+6.5,2.8,4.6,1.5,1.0
+5.7,2.8,4.5,1.3,1.0
+6.3,3.3,4.7,1.6,1.0
+4.9,2.4,3.3,1.0,1.0
+6.6,2.9,4.6,1.3,1.0
+5.2,2.7,3.9,1.4,1.0
+5.0,2.0,3.5,1.0,1.0
+5.9,3.0,4.2,1.5,1.0
+6.0,2.2,4.0,1.0,1.0
+6.1,2.9,4.7,1.4,1.0
+5.6,2.9,3.6,1.3,1.0
+6.7,3.1,4.4,1.4,1.0
+5.6,3.0,4.5,1.5,1.0
+5.8,2.7,4.1,1.0,1.0
+6.2,2.2,4.5,1.5,1.0
+5.6,2.5,3.9,1.1,1.0
+5.9,3.2,4.8,1.8,1.0
+6.1,2.8,4.0,1.3,1.0
+6.3,2.5,4.9,1.5,1.0
+6.1,2.8,4.7,1.2,1.0
+6.4,2.9,4.3,1.3,1.0
+6.6,3.0,4.4,1.4,1.0
+6.8,2.8,4.8,1.4,1.0
+6.7,3.0,5.0,1.7,1.0
+6.0,2.9,4.5,1.5,1.0
+5.7,2.6,3.5,1.0,1.0
+5.5,2.4,3.8,1.1,1.0
+5.5,2.4,3.7,1.0,1.0
+5.8,2.7,3.9,1.2,1.0
+6.0,2.7,5.1,1.6,1.0
+5.4,3.0,4.5,1.5,1.0
+6.0,3.4,4.5,1.6,1.0
+6.7,3.1,4.7,1.5,1.0
+6.3,2.3,4.4,1.3,1.0
+5.6,3.0,4.1,1.3,1.0
+5.5,2.5,4.0,1.3,1.0
+5.5,2.6,4.4,1.2,1.0
+6.1,3.0,4.6,1.4,1.0
+5.8,2.6,4.0,1.2,1.0
+5.0,2.3,3.3,1.0,1.0
+5.6,2.7,4.2,1.3,1.0
+5.7,3.0,4.2,1.2,1.0
+5.7,2.9,4.2,1.3,1.0
+6.2,2.9,4.3,1.3,1.0
+5.1,2.5,3.0,1.1,1.0
+5.7,2.8,4.1,1.3,1.0
\ No newline at end of file
diff --git a/main.py b/main.py
index 44d37d3..76d913a 100644
--- a/main.py
+++ b/main.py
@@ -1 +1,119 @@
-# -*- coding:utf-8 -*-
\ No newline at end of file
+# -*- coding:utf-8 -*-
+__author__ = 'xujia'
+
+import pandas as pd
+import numpy as np
+import binning
+import evaluate
+import modeling
+import woe
+import feature_index
+import feature_selection
+import math
+from pandas import Interval
+from numpy import inf
+from pprint import pprint
+
+
+def file_info(file_path):
+    """
+    Load a CSV file and summarise each of its columns.
+    :param file_path: path of the CSV file
+    :return: ({column name: [dtype, row count, missing-value count]}, loaded DataFrame)
+    """
+    raw_data = pd.read_csv(file_path)
+    info_dict = {}
+    for name, column in raw_data.items():
+        missing = column.size - column.notnull().sum()
+        # entry layout: [dtype, row count, missing-value count]
+        info_dict[name] = [column.dtype, column.size, missing]
+    return info_dict, raw_data
+
+
+def change_type(df, fea_type_dict):
+    """
+    Interactively cast one column of df to float64, int64 or str (in place).
+    :param df: DataFrame to modify; its columns are offered to the user by position
+    :param fea_type_dict: {column name: [dtype, ...]}; only printed, to show the current types
+    :return: None
+    """
+    type_dict = {1: 'float64', 2: 'int64', 3: 'str'}
+    feature_dict = dict(enumerate(df.columns.values))
+
+    print('当前数据类型为：')
+    for (k, v) in fea_type_dict.items():
+        print(k.rjust(15), v[0])
+
+    print('字段名称对应数字为：')
+    for (n, m) in feature_dict.items():
+        print(n, m)
+    if input('是否需要修改字段类型？(y/n)') != 'y':
+        return  # 'n' and any other answer both mean "leave the data untouched"
+    # Re-prompt until the answer is a known key: the old code re-asked only once, then consulted the
+    # unrelated global `fea_dict` and failed with NameError/KeyError on a second bad answer.
+    fea_name = feature_dict[_ask_key('请输入需要更改数据类型的字段对应的数字：',
+                                     '输入字段名称错误，请重新输入：', feature_dict)]
+    type_prompt = '请输入目标类型对应的数字(1: 浮点型(float64)，2: 整型(int64)，3: 字符型(str)：'
+    df[fea_name] = df[fea_name].astype(type_dict[_ask_key(type_prompt, type_prompt, type_dict)])
+
+
+def _ask_key(prompt, retry_prompt, choices):
+    """Prompt for an integer until it is a key of choices; non-numeric answers count as invalid."""
+    while True:
+        try:
+            answer = int(input(prompt))
+        except ValueError:
+            answer = None
+        if answer in choices:
+            return answer
+        prompt = retry_prompt
+
+
+def split_data(data_to_split, ratio):
+    """
+    Shuffle the rows of a frame and split them into two parts.
+    :param data_to_split: DataFrame to split
+    :param ratio: fraction of the rows that goes into the first part
+    :return: [first part, second part], or the string 'Data is too less' when the first part would be empty
+    """
+    selected_count = int(data_to_split.shape[0] * ratio)
+    if selected_count <= 0:
+        return 'Data is too less'  # legacy sentinel, kept so existing callers behave the same
+    # Shuffle the argument itself: the old code sampled the module-level `data` by mistake, which only
+    # worked when this file was run as a script and happened to be splitting that very frame.
+    shuffled = data_to_split.sample(frac=1)
+    return [shuffled.iloc[:selected_count], shuffled.iloc[selected_count:]]
+
+
+if __name__ == '__main__':
+    # path=input('Please input the file path: ')
+    path = 'iris.csv'
+    fea_dict, data = file_info(path)
+    print('字段名', '数据类型', '数据总量', '缺失值个数')
+    pprint(fea_dict)
+    data = data.fillna(0.0)
+
+    change_type(data, fea_dict)
+    print(data.dtypes)
+
+    binner = binning.Bin(data, 'Label', 5)  # not named `bin`, which would shadow the builtin
+    for n in data.columns.values[:-1]:
+        bins = binner.chi_merge(n)
+        woe.add_woe_col(data, bins)
+
+    # Single-variable AR calculation
+    # ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label'])
+
+    train_data, test_data = split_data(data, 0.7)
+    woe_cols = ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']  # features fed to the model
+    model = modeling.model(train_data, woe_cols, 'Label')
+    predict_score = modeling.score_trans(test_data[woe_cols], model, 300, 25)
+    pprint(list(zip(test_data['Label'].values, predict_score)))
+    auc = evaluate.auc(model, test_data[woe_cols + ['Label']])
+    print("auc值: " + str(auc))
+    evaluate.roc(model, test_data[woe_cols + ['Label']])
+
+    # select_func = feature_selection.fea_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1)
+    # print(select_func.transform(data[['SepalLength', 'SepalWidth']]))
+    # feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label'])
+    # feature_selection.mi(data['SepalWidth_woe'], data['Label'])
diff --git a/modeling.py b/modeling.py
new file mode 100644
index 0000000..ed4729f
--- /dev/null
+++ b/modeling.py
@@ -0,0 +1,20 @@
+# -*- coding:utf-8 -*-
+__author__ = 'xujia'
+import numpy as np
+
+from sklearn.linear_model import LogisticRegression
+
+
+def model(data, fea_list, target):
+    # Fit a default LogisticRegression on data[fea_list] against data[target]; return the fitted estimator.
+    features, labels = data[fea_list], data[target]
+    return LogisticRegression().fit(features, labels)
+
+
+def score_trans(data, model, scaled_value, pdo):
+    # Map the probability in column 1 of predict_proba to a score:
+    # scaled_value + (pdo / ln 2) * log-odds, i.e. doubling the odds adds pdo points.
+    factor = pdo / np.log(2)
+    proba = model.predict_proba(data)[:, 1]
+    log_odds = np.log(proba / (1 - proba))
+    return scaled_value + factor * log_odds
diff --git a/woe.py b/woe.py
new file mode 100644
index 0000000..c3bb464
--- /dev/null
+++ b/woe.py
@@ -0,0 +1,230 @@
+# -*- coding:utf-8 -*-
+
+import pandas as pd
+import numpy as np
+import math
+from scipy import stats
+from sklearn.utils.multiclass import type_of_target
+
+
+class WOE:
+    def __init__(self):
+        self._WOE_MIN = -20  # WOE assigned to a category that contains no event rows
+        self._WOE_MAX = 20  # WOE assigned to a category that contains no non-event rows
+
+    def woe(self, X, y, event=1):
+        '''
+        Calculate the WOE of each feature category and the information value of each feature
+        :param X: 2-D numpy array of explanatory features; continuous columns are discretized first
+        :param y: 1-D numpy array target variable which should be binary
+        :param event: the value of y that stands for the event to predict
+        :return: numpy array of woe dictionaries, each dictionary contains woe values for categories of each feature
+                 numpy array of information value of each feature
+        '''
+        self.check_target_binary(y)
+        X1 = self.feature_discretion(X)
+
+        res_woe = []
+        res_iv = []
+        for i in range(0, X1.shape[-1]):
+            x = X1[:, i]
+            woe_dict, iv1 = self.woe_single_x(x, y, event)
+            res_woe.append(woe_dict)
+            res_iv.append(iv1)
+        return np.array(res_woe), np.array(res_iv)
+
+    def woe_single_x(self, x, y, event=1):
+        """
+        Calculate the WOE values and the information value for a single feature
+        :param x: 1-D numpy array that stands for a single (already discrete) feature
+        :param y: 1-D numpy array target variable
+        :param event: the value of y that stands for the event to predict
+        :return: (dictionary of WOE values per category of this feature, information value of this feature)
+        """
+        self.check_target_binary(y)
+
+        event_total, non_event_total = self.count_binary(y, event=event)
+        x_labels = np.unique(x)
+        woe_dict = {}
+        iv = 0
+        for x1 in x_labels:
+            y1 = y[np.where(x == x1)[0]]
+            event_count, non_event_count = self.count_binary(y1, event=event)
+            rate_event = 1.0 * event_count / event_total
+            rate_non_event = 1.0 * non_event_count / non_event_total
+            if rate_event == 0:
+                woe1 = self._WOE_MIN
+            elif rate_non_event == 0:
+                woe1 = self._WOE_MAX
+            else:
+                woe1 = math.log(rate_event / rate_non_event)
+            woe_dict[x1] = woe1
+            iv += (rate_event - rate_non_event) * woe1
+        return woe_dict, iv
+
+    def woe_replace(self, X, woe_arr):
+        """
+        Replace the explanatory feature categories with their WOE values
+        :param X: 2-D numpy array of explanatory features which should be discretized already
+        :param woe_arr: numpy array of woe dictionaries, each dictionary contains woe values for categories of each feature
+        :return: a new float numpy array in which the WOE values are filled in
+        """
+        if X.shape[-1] != woe_arr.shape[-1]:
+            raise ValueError('WOE dict array length must be equal with features length')
+
+        res = np.copy(X).astype(float)
+        for idx, woe_dict in enumerate(woe_arr):
+            source = np.asarray(X)[:, idx]
+            for k, woe in woe_dict.items():
+                # Match against the untouched input: matching against `res` would remap a cell again
+                # whenever a WOE value that was already written equals a later category key.
+                res[source == k, idx] = woe * 1.0
+        return res
+
+    def combined_iv(self, X, y, masks, event=1):
+        """
+        Calculate the information value of a combination of features
+        :param X: 2-D numpy array of explanatory features which should be discretized already
+        :param y: 1-D numpy array target variable
+        :param masks: 1-D numpy array of masks stands for which features are included in combination,
+                      e.g. np.array([0,0,1,1,1,0,0,0,0,0,1]), the length should be same as features length
+        :param event: the value of y that stands for the event to predict
+        :return: woe dictionary and information value of combined features
+        """
+        if masks.shape[-1] != X.shape[-1]:
+            raise ValueError('Masks array length must be equal with features length')
+
+        x = X[:, np.where(masks == 1)[0]]
+        tmp = []
+        for i in range(x.shape[0]):
+            tmp.append(self.combine(x[i, :]))
+
+        dumy = np.array(tmp)
+        # every distinct combination string is treated as one category of a single feature
+        woe, iv = self.woe_single_x(dumy, y, event)
+        return woe, iv
+
+    def combine(self, list):
+        """Concatenate the str() of every item into one key string."""
+        # The parameter keeps its (builtin-shadowing) name so existing keyword callers still work;
+        # str.join replaces the quadratic `res += str(item)` loop.
+        return ''.join(str(item) for item in list)
+
+    def count_binary(self, a, event=1):
+        event_count = (a == event).sum()
+        non_event_count = a.shape[-1] - event_count
+        return event_count, non_event_count
+
+    def check_target_binary(self, y):
+        """
+        Check that the target variable is binary; raise ValueError if it is not.
+        :param y: target values, classified with sklearn's type_of_target
+        :return: None
+        """
+        y_type = type_of_target(y)
+        if y_type not in ['binary']:
+            raise ValueError('Label type must be binary')
+
+    def feature_discretion(self, X):
+        """
+        Discretize the continuous features of input data X, and keep other features unchanged.
+        :param X : numpy array
+        :return: the numpy array in which all continuous features are discretized
+        """
+        temp = []
+        for i in range(0, X.shape[-1]):
+            x = X[:, i]
+            x_type = type_of_target(x)
+            if x_type == 'continuous':
+                x1 = self.discrete(x)
+                temp.append(x1)
+            else:
+                temp.append(x)
+        return np.array(temp).T
+
+    def discrete(self, x):
+        """
+        Discretize the input 1-D numpy array into 5 bins bounded by its 0/20/40/60/80/100 percentiles
+        :param x: 1-D numpy array
+        :return: 1-D int numpy array of bin numbers 1..5
+        """
+        res = np.array([0] * x.shape[-1], dtype=int)
+        for i in range(5):
+            point1 = stats.scoreatpercentile(x, i * 20)
+            point2 = stats.scoreatpercentile(x, (i + 1) * 20)
+            # both ends are inclusive, so a value on a shared boundary ends up in the higher bin
+            mask = (x >= point1) & (x <= point2)
+            res[mask] = (i + 1)
+        return res
+
+    def woe_feature(self, x, dict):
+        new_x = []
+        for i in x:
+            new_x.append(dict[i])
+        return new_x
+
+    @property
+    def WOE_MIN(self):
+        return self._WOE_MIN
+
+    @WOE_MIN.setter
+    def WOE_MIN(self, woe_min):
+        self._WOE_MIN = woe_min
+
+    @property
+    def WOE_MAX(self):
+        return self._WOE_MAX
+
+    @WOE_MAX.setter
+    def WOE_MAX(self, woe_max):
+        self._WOE_MAX = woe_max
+
+
+def add_woe_col(data, bins):
+    """
+    Add a '<feature>_woe' column to data holding the WOE of the bin each row falls into.
+    :param data: raw DataFrame, modified in place ('<feature>_d' / '<feature>_f' helper columns are removed)
+    :param bins: bin table whose index (named after the feature) holds the bin start values, with count
+                 columns 0.0 and 1.0 and a 'bin' column of interval labels; its index is recast to float
+    :return: None
+    """
+    fea_name = bins.index.name
+    lower_edges = bins.index.values.astype(float)
+    bins.index = lower_edges
+    bins.index.name = fea_name
+    edges = np.append(lower_edges, np.inf)
+    max_woe, min_woe = 10, -10  # caps used when a bin holds no rows of one of the two classes
+    total_0, total_1 = bins[0.0].sum(), bins[1.0].sum()
+    bin_woe = {}
+    for lower, upper in zip(edges[:-1], edges[1:]):
+        if lower == upper:
+            continue  # two identical consecutive edges: nothing to score
+        rate_event = bins[0.0][lower] / total_0
+        rate_non_event = bins[1.0][lower] / total_1
+        if rate_event == 0.0:
+            woe_value = min_woe
+        elif rate_non_event == 0.0:
+            woe_value = max_woe
+        else:
+            woe_value = math.log(rate_event / rate_non_event)
+        bin_woe[bins['bin'][lower]] = woe_value
+
+    # Label every row with the string form of its interval, then look that label up in bin_woe.
+    cut_edges = np.append(bins.index.values, [np.inf])
+    data[fea_name + '_bin'] = pd.cut(data[fea_name], bins=cut_edges).astype(str)
+    data[fea_name + '_woe'] = data[fea_name + '_bin'].apply(lambda label: bin_woe[label])
+
+    # Drop the temporary label column and any helper columns left behind by the Bin methods.
+    for suffix in ('_bin', '_d', '_f'):
+        if fea_name + suffix in data.columns.values:
+            del data[fea_name + suffix]
+
+# if __name__ == '__main__':
+#     path=input('Please input the file path: ')
+#     path = 'iris.csv'
+#     raw_data = pd.read_csv(path)
+#     print(raw_data)
+#     woe = WOE()
+#     woe_result=woe.woe_single_x(x=raw_data,'SepalLength')
+#     ret = pd.cut(raw_data['SepalLength'], 5)
+#     print(ret)