class Bin:
    """Bin one numeric feature and count target classes per bin.

    Every binning method returns a DataFrame with one row per bin and one
    column per distinct target value; each cell is the number of samples of
    that target value that fall into the bin.
    """

    def __init__(self, df, target_name, bin_count):
        """
        :param df: DataFrame holding the feature columns and the target
            column; the binning methods add helper columns to it in place
        :param target_name: name of the target column
        :param bin_count: requested number of bins
        """
        self.df = df
        self.target_name = target_name
        self.bin_count = bin_count

    def _count_by_bin(self, bin_col, fea_name):
        """Count samples per (bin, target value), indexed by left bin edge."""
        # observed=False keeps empty bins as all-zero rows regardless of the
        # pandas default for categorical group keys.
        grouped = self.df[[bin_col, self.target_name]].groupby(
            [bin_col, self.target_name], observed=False)
        fea_count = grouped.size().unstack().fillna(0.0)
        fea_count.index = fea_count.index.map(lambda x: x.left)
        fea_count.index.name = fea_name
        return fea_count

    def equal_distance_binning(self, fea_name):
        """
        Equal-width binning.

        Stores the bin of every row in ``self.df[fea_name + '_d']``.

        :param fea_name: name of the feature column to bin
        :return: per-bin target counts, indexed by the left edge of each bin
        """
        bin_col = fea_name + '_d'
        self.df[bin_col] = pd.cut(self.df[fea_name], self.bin_count)
        return self._count_by_bin(bin_col, fea_name)

    def equal_frequency_binning(self, fea_name):
        """
        Equal-frequency (quantile) binning.

        Stores the bin of every row in ``self.df[fea_name + '_f']``.

        :param fea_name: name of the feature column to bin
        :return: per-bin target counts, indexed by the left edge of each bin
        """
        bin_col = fea_name + '_f'
        # qcut, not cut: the bins must hold (roughly) equal numbers of rows.
        self.df[bin_col] = pd.qcut(self.df[fea_name], self.bin_count,
                                   duplicates='drop')
        return self._count_by_bin(bin_col, fea_name)

    def auto_binning(self, fea_name):
        """
        Automatic binning.

        Starts from ``self.bin_count`` quantile bins and removes one bin at a
        time until the per-bin feature means and target means are perfectly
        rank-correlated (|Spearman r| == 1) or only two bins are left.
        ``self.bin_count`` itself is left unchanged. The final bins are
        stored in ``self.df[fea_name + '_d']``.

        :param fea_name: name of the feature column to bin
        :return: per-bin target counts, indexed by the left edge of each bin
        """
        n_bins = max(int(self.bin_count), 1)
        while True:
            binned = pd.qcut(self.df[fea_name], n_bins, duplicates='drop')
            means = pd.DataFrame({'X': self.df[fea_name],
                                  'Y': self.df[self.target_name],
                                  'bin': binned}
                                 ).groupby('bin', observed=True)[['X', 'Y']].mean()
            # A rank correlation needs at least two bins, and there is
            # nothing left to remove once two bins remain.
            if len(means) < 2 or n_bins <= 2:
                break
            r, _ = stats.spearmanr(means['X'], means['Y'])
            if not np.abs(r) < 1:  # also stops when r is NaN
                break
            n_bins -= 1

        bin_col = fea_name + '_d'
        self.df[bin_col] = binned
        return self._count_by_bin(bin_col, fea_name)

    def chi2(self, A):
        """
        Compute the chi-square statistic of a contingency table.

        :param A: 2-D array of observed counts (chi_merge passes two
            adjacent rows)
        :return: chi-square value; cells whose expected count is zero are
            skipped, and an all-zero table yields 0.0
        """
        observed = np.asarray(A, dtype=float)
        total = observed.sum()
        if total == 0:
            return 0.0
        row_sums = observed.sum(axis=1, keepdims=True)
        col_sums = observed.sum(axis=0, keepdims=True)
        expected = row_sums * col_sums / total
        valid = expected != 0
        return float(((observed - expected)[valid] ** 2 / expected[valid]).sum())

    def chi_merge(self, fea_name):
        """
        Main ChiMerge algorithm.

        Starts with one bin per distinct feature value and repeatedly merges
        the adjacent pair with the smallest chi-square value until at most
        ``self.bin_count`` bins (never fewer than one) remain.

        :param fea_name: name of the feature column to segment
        :return: per-bin target counts indexed by each bin's lower edge (the
            first edge is replaced by -inf), plus a ``bin`` column holding
            the interval label of each bin
        """
        fea_count = self.df[[fea_name, self.target_name]].groupby(
            [fea_name, self.target_name]).size().unstack().fillna(0.0)

        min_rows = max(self.bin_count, 1)
        while fea_count.shape[0] > min_rows:
            counts = fea_count.values
            chi_values = [self.chi2(counts[i:i + 2])
                          for i in range(len(counts) - 1)]
            i = int(np.argmin(chi_values))
            current_fea = fea_count.index[i]
            next_fea = fea_count.index[i + 1]
            # Fold the upper row of the pair into the lower one.
            fea_count.loc[current_fea] = (fea_count.loc[current_fea]
                                          + fea_count.loc[next_fea])
            fea_count = fea_count.drop(next_fea)

        edges = np.append(np.append([-np.inf], fea_count.index.values[1:]),
                          [np.inf])
        fea_count.index = edges[:-1]
        # Cutting the edges by themselves yields one right-closed interval
        # label per bin; the leading -inf falls in no interval and is dropped.
        # NOTE(review): rows are keyed by their lower edge but the labels are
        # right-closed, so a value equal to an edge is labelled with the
        # previous bin - confirm this is intended.
        fea_count['bin'] = pd.cut(edges, bins=edges)[1:].astype(str)
        fea_count.index.name = fea_name
        return fea_count
def auc(model, test_data):
    """
    Compute the AUC of a fitted classifier on a test set.

    :param model: fitted model exposing ``predict_proba``
    :param test_data: test DataFrame; the first through second-to-last
        columns are the features and the last column is the target
    :return: AUC value
    """
    # .iloc replaces DataFrame.ix, which was removed in pandas 1.0.
    features = test_data.iloc[:, 0:-1]
    target = test_data.iloc[:, -1]
    predict_value = model.predict_proba(features)[:, 1]
    return metrics.roc_auc_score(target, predict_value)


def roc(model, test_data):
    """
    Plot the ROC curve of a fitted classifier on a test set.

    The figure is displayed with ``plt.show()``; nothing is returned.

    :param model: fitted model exposing ``predict_proba``
    :param test_data: test DataFrame; the first through second-to-last
        columns are the features and the last column is the target
    """
    # .iloc replaces DataFrame.ix, which was removed in pandas 1.0.
    features = test_data.iloc[:, 0:-1]
    target = test_data.iloc[:, -1]
    predict_value = model.predict_proba(features)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(target, predict_value)
    roc_auc = metrics.auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label='data1, AUC = %0.2f' % roc_auc)
    plt.legend(loc=4)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Diagram")
    plt.show()
pd.read_csv(excel_name) +# # 创建逻辑回归模型 +# logit_model = LogisticRegression() +# final_list = [] +# for col in data.columns.values[0:-1]: +# if col != self.dest_var: +# # 特征变量值 +# X = data[col].values.reshape(-1, 1) +# # 拆分数据集为训练集与测试集 +# x_train = X[:-20] +# x_test = X[-20:] +# # 目标变量值 +# y = data[self.dest_var].values.reshape(-1, 1) +# y_train = y[:-20] +# y_test = y[-20:] +# # 数据拟合 +# logit_model.fit(x_train, y_train) +# # 每一列与y列做预测 +# # prob = logit_model.predict_proba(data[col].values.reshape(-1, 1)) +# prob = logit_model.predict_proba(x_test) +# # prob[:, 1] 预测结果为两列,分别为0值可能性与1值可能性,此处取1值可能性 +# # fpr, tpr, thresholds = roc_curve(data[self.dest_var].values.reshape(-1, 1), prob[:, 1]) +# fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1]) +# from scipy import stats +# # AR = float(stats.ks_2samp(y_test, prob[:, 1].reshape(-1, 1)).statistic) +# # AR = float(stats.ks_2samp(y_test.ravel(), prob[:, 1]).statistic) +# # testDF = pd.DataFrame() +# # testDF['predict_proba'] = prob[:,1] +# # testDF['label'] = np.array(y_test) +# # print self.cal_ks(testDF) +# # print str(AR) + "-" * 30 +# ks = abs(fpr - tpr).max() +# # print str(ks) + "*" * 30 +# # print ks +# if ks > self.threshold: +# final_list.append({'varName': col, "AR": ks}) +# else: +# self.logger.info('列:' + col + '的AR值为:' + str(ks) + ", 低于阈值:" + str(self.threshold)) +# # AR值排序 +# final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True) +# self.logger.info(pd.DataFrame(final_list)) +# pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False) +# +# def cal_ks(self, data): +# """ +# 手动计算KS值 +# :param data: +# :return: +# """ +# # 对样本数据排序,根据预测值升序排序 +# sorted_list = data.sort_values(['predict_proba'], ascending=True) +# total_good_count = sorted_list['label'].sum() * 1.0 +# total_bad_count = (sorted_list.shape[0] - total_good_count) * 1.0 +# max_ks = 0.0 +# good_count = 0.0 +# bad_count = 0.0 +# for index, row in sorted_list.iterrows(): +# if row['label'] == 0: +# bad_count += 1.0 
+# else: +# good_count += 1.0 +# val = abs(bad_count / total_bad_count - good_count / total_good_count) +# max_ks = max(max_ks, val) +# return max_ks +# +# def cal_ar(self, excel_name='test.xlsx'): +# excel = pd.read_excel(excel_name) +# if excel.columns.size < 2: +# self.logger.error("未找到Excel数据源!") +# return +# dest_value = excel[self.dest_var] +# final_list = [] +# for col in excel.columns: +# if col != self.dest_var: +# AR = float(stats.ks_2samp(excel[col], dest_value).statistic) +# final_list +# # self.logger.info(final_list) +# # final_list.append({'AR': 1.0, 'colName': u'var3'}) +# # final_list.append({'AR': 0.8, 'colName': u'var4'}) +# final_list.sort(key=lambda ar_dict: ar_dict['AR'], reverse=True) +# # self.logger.info("final result:" + str(final_list)) +# # self.logger.info("123") +# self.logger.info(pd.DataFrame(final_list)) +# pd.DataFrame(final_list, columns=['varName', 'AR']).to_excel('result.xlsx', index=False) +# +# def fill_empty_value(self, col_name, data, default_value=0): +# """ +# 缺失值填充 +# 输入:宽表【变量1、变量2、目标变量】,变量名称,缺失值填充值(默认0) +# 计算方式:直接将指定变量中的缺失值用参数中的填充值进行填充 +# 输出:填充后的宽表,变量缺失率 +# """ +# # data = pd.read_excel(file_name) +# if col_name not in data.columns.values: +# self.logger.error("输入宽表中不存在指定变量") +# return +# else: +# empty_count = data[col_name].shape[0] - data[col_name].count() +# if empty_count > 0: +# self.logger.info('当前共' + str(data.shape[0]) + '个变量值,其中缺失值个数为' + str(empty_count)) +# # 替换空串为NAN +# data[col_name] = data[col_name].replace(' ', np.nan).fillna(value=default_value) +# self.logger.info('填补后,缺失值个数为' + str(data[col_name].shape[0] - data[col_name].count())) +# # data.to_excel('result.xls', index=False) +# return data +# else: +# self.logger.info('当前不存在缺失值') +# +# def del_empty_value(self, data, empty_rate_threshold=0.5): +# """ +# 缺失值剔除 +# 输入:宽表【变量1、变量2、目标变量】,缺失率(默认0.5) +# 计算方式:计算宽表中各个变量的缺失率,并剔除缺失率超过0.5的变量 +# 输出:处理后宽表 +# """ +# for col in data.columns.values: +# if col == 'y': +# continue +# empty_ratio = (data[col].shape[0] - 
data[col].count()) / data[col].shape[0] +# if empty_ratio >= empty_rate_threshold: +# self.logger.info("变量:" + col + "缺失率为" + str(empty_ratio) + ",高于阈值:" + str(empty_rate_threshold)) +# data = data.drop(col, axis=1) +# return data +# # data.to_excel(file_name.split(".")[0] + "_new." + file_name.split(".")[1], index=False) +# +# def console_input(self, prompt="", if_value=[], else_value=[], if_rtn="", else_rtn=""): +# rtn = input(prompt) +# if rtn.strip() in if_value: +# return if_rtn +# elif rtn.strip() in else_value or len(else_value) == 0: +# return else_rtn +# else: +# raise IOError("未匹配到条件") +# +# def file_info(self, path): +# """ +# 获取文件信息 +# :param path: 文件路径 +# :return: {字段名称:[字段类型,数据量,空值个数]} +# """ +# info_dict = {} +# data = pd.read_csv(path) +# for c in data.columns: +# ctype = data[c].dtype +# nc = data[c].size - data[c].notnull().sum() +# info_dict[c] = [ctype, data[c].size, nc] # 字段类型,数据量,空值个数 +# return info_dict, data +# +# def is_contain_empty_value(self, file_dict): +# empty_col_list = [] +# for item in file_dict: +# self.logger.info(file_dict[item]) +# if int(file_dict[item][2]) > 0: +# self.logger.info("列" + item + "空值个数:" + str(file_dict[item][2])) +# empty_col_list.append(item) +# if len(empty_col_list) > 0: +# return True, empty_col_list +# else: +# return False, [] +# +# def main(self): +# file_path = input("请输入待处理的文件名路径:") +# import os.path +# if os.path.isfile(file_path): +# file_dict, data = self.file_info(file_path) +# is_contain_empty_value, empty_col_list = self.is_contain_empty_value(file_dict) +# if is_contain_empty_value: +# self.logger.info("当前存在缺失值") +# is_fill_empty = self.console_input(prompt="是否需要填充数据?1:是,其他值:否", if_value=["1"], else_value=[], +# if_rtn=True, else_rtn=False) +# if is_fill_empty: +# for col in empty_col_list: +# fill_value = input("请输入列" + col + "待填充的数据:") +# self.logger.info("列" + col + "将填充数据:" + fill_value) +# data = self.fill_empty_value(col_name=col, data=data, default_value=fill_value) +# print(data) +# 
def cal_ar(X, y):
    """
    Compute the AR value (2 * AUC - 1) of a single feature.

    The data is split 70/30 with a fixed random seed, a logistic regression
    is fitted on the training part and scored on the held-out part.

    :param X: single feature; must expose ``.values`` (reshaped to a column)
    :param y: target values
    :return: AR value measured on the held-out part
    """
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=42)
    classifier = LogisticRegression()
    classifier.fit(train_x.values.reshape(-1, 1), train_y)
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = classifier.predict_proba(
        test_x.values.reshape(-1, 1))[:, 1]
    auc_value = metrics.roc_auc_score(test_y, positive_proba)
    ar = 2.0 * auc_value - 1.0
    print('ar值:%s' % str(ar))
    return ar


def correlation_coef(data):
    """
    Compute and print the pairwise correlation matrix of ``data``.

    :param data: object exposing ``corr()`` (e.g. a DataFrame)
    :return: the correlation matrix returned by ``data.corr()``
    """
    matrix = data.corr()
    print(matrix)
    return matrix
def mi(X, y):
    """
    Compute the maximal information coefficient (MIC) of features against y.

    :param X: a DataFrame (one score per column, any number of columns) or a
        single 1-D feature such as a Series
    :param y: target values
    :return: dict mapping each feature name to its MIC; for a single feature
        the key is its ``name`` attribute (None when it has none)
    """
    m = MINE()
    mi_dict = {}
    # Branch on the input's shape explicitly instead of relying on a bare
    # ``except``: the old code returned None for a one-column DataFrame and
    # hid genuine errors raised while scoring.
    if hasattr(X, 'columns'):
        for f in X.columns:
            m.compute_score(X[f], y)
            mi_dict[f] = m.mic()
    else:
        m.compute_score(X, y)
        mi_dict[getattr(X, 'name', None)] = m.mic()
    print(mi_dict)
    return mi_dict
def file_info(file_path):
    """
    Read a CSV file and summarise its columns.

    :param file_path: path of the CSV file
    :return: tuple ``(info_dict, raw_data)`` where ``info_dict`` maps each
        column name to ``[dtype, row count, null count]`` and ``raw_data``
        is the loaded DataFrame
    """
    raw_data = pd.read_csv(file_path)
    info_dict = {}
    for c in raw_data.columns:
        column = raw_data[c]
        # dtype, number of rows, number of missing values
        info_dict[c] = [column.dtype, column.size, column.isnull().sum()]
    return info_dict, raw_data


def _prompt_for_key(first_prompt, retry_prompt, valid_keys):
    """Ask on stdin until the answer is an integer contained in valid_keys."""
    prompt = first_prompt
    while True:
        try:
            choice = int(input(prompt))
        except ValueError:
            choice = None
        if choice in valid_keys:
            return choice
        prompt = retry_prompt


def change_type(df, fea_type_dict):
    """
    Interactively change the dtype of one column of ``df`` in place.

    Prints the current column types and a number for each column, then asks
    on stdin whether to change a type, which column to change and which
    target type to use. Invalid answers are asked again until valid.

    :param df: DataFrame to modify
    :param fea_type_dict: mapping of column name to a sequence whose first
        item is the column's current type (as returned by ``file_info``)
    :return: None
    """
    type_dict = {1: 'float64', 2: 'int64', 3: 'str'}
    feature_dict = dict(enumerate(df.columns.values))

    print('当前数据类型为:')
    for k, v in fea_type_dict.items():
        print(k.rjust(15), v[0])

    print('字段名称对应数字为:')
    for n, m in feature_dict.items():
        print(n, m)

    if input('是否需要修改字段类型?(y/n)') != 'y':
        return

    fea_key = _prompt_for_key('请输入需要更改数据类型的字段对应的数字:',
                              '输入字段名称错误,请重新输入:',
                              feature_dict)
    fea_name = feature_dict[fea_key]

    type_prompt = '请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):'
    target_type = type_dict[_prompt_for_key(type_prompt, type_prompt, type_dict)]
    df[fea_name] = df[fea_name].astype(target_type)


def split_data(data_to_split, ratio):
    """
    Shuffle a DataFrame and split it in two by rows.

    :param data_to_split: DataFrame to split
    :param ratio: fraction of rows that goes into the first part
    :return: list ``[part1, part2]`` of DataFrames, or the string
        ``'Data is too less'`` when the first part would be empty
    """
    selected_count = int(data_to_split.shape[0] * ratio)
    if selected_count <= 0:
        return 'Data is too less'
    # Shuffle the argument itself (not a module-level global) and slice by
    # position; np.split on a DataFrame relies on the deprecated swapaxes.
    shuffled = data_to_split.sample(frac=1)
    return [shuffled.iloc[:selected_count], shuffled.iloc[selected_count:]]
class WOE:
    """Weight-of-evidence (WOE) and information-value (IV) calculator."""

    def __init__(self):
        # WOE assigned to a category that contains no events / no non-events.
        self._WOE_MIN = -20
        self._WOE_MAX = 20

    def woe(self, X, y, event=1):
        """
        Calculate the WOE of each feature category and the information value.

        :param X: 2-D numpy array of explanatory features; continuous
            features are discretised first
        :param y: 1-D numpy array, binary target variable
        :param event: target value that stands for the event to predict
        :return: numpy array of WOE dictionaries (one per feature, mapping
            category to WOE) and numpy array of information values
        """
        self.check_target_binary(y)
        X1 = self.feature_discretion(X)

        res_woe = []
        res_iv = []
        for i in range(X1.shape[-1]):
            woe_dict, iv1 = self.woe_single_x(X1[:, i], y, event)
            res_woe.append(woe_dict)
            res_iv.append(iv1)
        return np.array(res_woe), np.array(res_iv)

    def woe_single_x(self, x, y, event=1):
        """
        Calculate WOE and information value for a single feature.

        :param x: 1-D numpy array, a single discrete feature
        :param y: 1-D numpy array, binary target variable
        :param event: target value that stands for the event to predict
        :return: dictionary mapping each category of x to its WOE, and the
            information value of the feature
        """
        self.check_target_binary(y)

        event_total, non_event_total = self.count_binary(y, event=event)
        woe_dict = {}
        iv = 0
        for x1 in np.unique(x):
            y1 = y[np.where(x == x1)[0]]
            event_count, non_event_count = self.count_binary(y1, event=event)
            rate_event = 1.0 * event_count / event_total
            rate_non_event = 1.0 * non_event_count / non_event_total
            if rate_event == 0:
                woe1 = self._WOE_MIN
            elif rate_non_event == 0:
                woe1 = self._WOE_MAX
            else:
                woe1 = math.log(rate_event / rate_non_event)
            woe_dict[x1] = woe1
            iv += (rate_event - rate_non_event) * woe1
        return woe_dict, iv

    def woe_replace(self, X, woe_arr):
        """
        Replace the feature categories with their WOE values.

        :param X: 2-D numpy array of already discretised features
        :param woe_arr: numpy array of WOE dictionaries, one per feature
        :return: new float numpy array with WOE values filled in
        :raises ValueError: if the number of dictionaries differs from the
            number of feature columns
        """
        if X.shape[-1] != woe_arr.shape[-1]:
            raise ValueError('WOE dict array length must be equal with features length')

        original = np.copy(X).astype(float)
        res = original.copy()
        for idx, woe_dict in enumerate(woe_arr):
            column = original[:, idx]
            for k, woe in woe_dict.items():
                # Match against the untouched input so that a WOE value equal
                # to another category key is not replaced a second time.
                res[column == k, idx] = woe * 1.0
        return res

    def combined_iv(self, X, y, masks, event=1):
        """
        Calculate the information value of a combination of features.

        :param X: 2-D numpy array of already discretised features
        :param y: 1-D numpy array, target variable
        :param masks: 1-D numpy array with 1 for every feature included in
            the combination, e.g. np.array([0,0,1,1,1,0,0,0,0,0,1]); its
            length must equal the number of features
        :param event: target value that stands for the event to predict
        :return: WOE dictionary and information value of the combined feature
        :raises ValueError: if the mask length differs from the feature count
        """
        if masks.shape[-1] != X.shape[-1]:
            raise ValueError('Masks array length must be equal with features length')

        x = X[:, np.where(masks == 1)[0]]
        # Each row's selected values are concatenated into one category label.
        dumy = np.array([self.combine(x[i, :]) for i in range(x.shape[0])])
        woe, iv = self.woe_single_x(dumy, y, event)
        return woe, iv

    def combine(self, list):
        """Concatenate the string form of every item into one string."""
        return ''.join(str(item) for item in list)

    def count_binary(self, a, event=1):
        """
        Count events and non-events in a 1-D array.

        :param a: 1-D numpy array of target values
        :param event: value counted as the event
        :return: tuple (event count, non-event count)
        """
        event_count = (a == event).sum()
        non_event_count = a.shape[-1] - event_count
        return event_count, non_event_count

    def check_target_binary(self, y):
        """
        Check that the target variable is binary.

        :param y: target values
        :raises ValueError: if ``type_of_target(y)`` is not 'binary'
        """
        y_type = type_of_target(y)
        if y_type not in ['binary']:
            raise ValueError('Label type must be binary')

    def feature_discretion(self, X):
        """
        Discretise the continuous features of X; keep the others unchanged.

        :param X: 2-D numpy array
        :return: numpy array in which all continuous features are discretised
        """
        temp = []
        for i in range(X.shape[-1]):
            x = X[:, i]
            if type_of_target(x) == 'continuous':
                temp.append(self.discrete(x))
            else:
                temp.append(x)
        return np.array(temp).T

    def discrete(self, x):
        """
        Discretise a 1-D numpy array into 5 equal-percentile buckets.

        Buckets are labelled 1..5. A value lying exactly on a percentile
        boundary ends up in the higher bucket because later buckets
        overwrite earlier ones.

        :param x: 1-D numpy array
        :return: 1-D integer numpy array of bucket labels
        """
        res = np.zeros(x.shape[-1], dtype=int)
        for i in range(5):
            point1 = stats.scoreatpercentile(x, i * 20)
            point2 = stats.scoreatpercentile(x, (i + 1) * 20)
            # Direct boolean mask; equivalent to the former np.in1d lookup.
            res[(x >= point1) & (x <= point2)] = i + 1
        return res

    def woe_feature(self, x, dict):
        """Map every item of x through the mapping ``dict``; return a list."""
        return [dict[i] for i in x]

    @property
    def WOE_MIN(self):
        return self._WOE_MIN

    @WOE_MIN.setter
    def WOE_MIN(self, woe_min):
        self._WOE_MIN = woe_min

    @property
    def WOE_MAX(self):
        return self._WOE_MAX

    @WOE_MAX.setter
    def WOE_MAX(self, woe_max):
        self._WOE_MAX = woe_max


def add_woe_col(data, bins):
    """
    Add a WOE column for one feature to ``data`` in place.

    The new column is named ``<feature>_woe``. The helper columns
    ``<feature>_bin``, ``<feature>_d`` and ``<feature>_f`` are removed from
    ``data`` when present. The index of ``bins`` is converted to float in
    place.

    :param data: original data; must contain the feature column named by
        ``bins.index.name``
    :param bins: segment information indexed by each bin's lower edge, with
        count columns ``0.0`` and ``1.0`` and a ``bin`` column holding the
        interval label of each bin
    :return: None
    :raises KeyError: if a row's interval label has no WOE entry
    """
    fea_name = bins.index.name
    bin_index = bins.index.values.astype(float)
    bins.index = bin_index
    bins.index.name = fea_name
    edges = np.append(bin_index, np.inf)

    max_woe = 10
    min_woe = -10
    # Column totals do not change inside the loop, so compute them once.
    total_event = bins[0.0].sum()
    total_non_event = bins[1.0].sum()

    bin_woe = {}
    for left, right in zip(edges[:-1], edges[1:]):
        if left == right:
            continue
        # The 0.0 column is the numerator class and 1.0 the denominator.
        rate_event = bins.loc[left, 0.0] / total_event
        rate_non_event = bins.loc[left, 1.0] / total_non_event
        if rate_event == 0.0:
            woe_value = min_woe
        elif rate_non_event == 0.0:
            woe_value = max_woe
        else:
            woe_value = math.log(rate_event / rate_non_event)
        bin_woe[bins.loc[left, 'bin']] = woe_value

    # Label every row with its interval, then look up that interval's WOE.
    labels = pd.cut(data[fea_name], bins=edges).astype(str)
    data[fea_name + '_woe'] = labels.apply(lambda label: bin_woe[label])

    for suffix in ('_bin', '_d', '_f'):
        helper_col = fea_name + suffix
        if helper_col in data.columns.values:
            del data[helper_col]