竞赛圈 > 凤凰金融初赛榜4 baseline 42+分
凤凰金融初赛榜4 baseline 42+分
整个流程分为多个脚本,其中若干脚本专门用于清洗数据
脚本1整理数据,读入每个文件,并按天排序,整理成一个汇总文件all_data.csv
import pandas as pd
import os

# Root folder holding one CSV per trading day, named "<day>.csv".
DATA_DIR = r"E:\DataCastle\1.凤凰金融\round4\round4"

file_list = os.listdir(DATA_DIR)
data_list = []
# Read each daily file and tag every row with its trading-day number,
# parsed from the file name (e.g. "17.csv" -> days == 17).
for fname in file_list:
    datas = pd.read_csv(os.path.join(DATA_DIR, fname))
    datas["days"] = int(fname.split(".")[0])
    data_list.append(datas)
# Column header list (all daily files share one schema).
col_list = list(data_list[0].columns)
# Concatenate all days in one call: pd.concat on the full list is linear,
# unlike the original pattern of repeatedly concatenating into a growing
# frame (quadratic copying).
df = pd.concat(data_list)
df.to_csv("all_data.csv", index=False)
脚本2观察分析,这步比较简单,画些图看看因子相关性
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv("all_data.csv")
col_list = list(data.columns)
######################################
# (earlier exploration) count of distinct values per feature
'''
f_times = []
for i in range(len(col_list)):
f_times.append(len(pd.unique(data[col_list[i]])))
f_df = pd.DataFrame()
f_df["col_name"] = col_list
f_df["f_times"] = f_times
'''
code_list = list(pd.unique(data.code))


def get_df(code):
    """Return the rows of one stock, ordered by trading day."""
    global data
    return data[data["code"] == code].sort_values("days").reset_index(drop=True)


# Eyeball one stock: close series, factor f29, and their joint scatter.
df1 = get_df(code_list[2])
plt.plot(df1.close)
plt.plot(df1.f29)
plt.scatter(df1.close, df1.f29)
脚本3清洗数据:这一步主要处理股票的除权和停牌。其中除权通过一个简单规则判断——单日跌幅达到11%及以上即认为发生了除权。处理结果保存为new_data2.csv
import pandas as pd
import numpy as np

data = pd.read_csv("all_data.csv")
code_list = list(pd.unique(data.code))  # all stock codes


def get_df(code):
    """Return one stock's rows aligned to the full 1..488 trading-day axis.

    Days on which the stock has no row (suspension) are created by an
    outer merge against the complete day range and then forward-filled,
    so every stock ends up with exactly 488 rows.
    """
    global data
    all_days = pd.DataFrame(np.arange(1, 489), columns=["days"])
    a = data[data["code"] == code].sort_values("days").reset_index(drop=True)
    return (pd.merge(a, all_days, how="outer")
            .sort_values("days")
            .reset_index(drop=True)
            .ffill())  # .ffill() == fillna(method="ffill"), which is deprecated


code_df = []  # one aligned DataFrame per stock, same order as code_list
for i, code in enumerate(code_list):
    code_df.append(get_df(code))
    print(i)
def get_cq_day(dfs):
    """Return the row indices assumed to be ex-rights (除权) days.

    The original built the daily return by hand through two shifted diff
    columns; that computation is exactly ``close.pct_change()``.  A day
    whose close dropped 11% or more versus the previous close is treated
    as an ex-rights adjustment (the daily limit-down is 10%, so a larger
    drop cannot be ordinary trading).

    Works for any number of rows — the hand-rolled version assumed
    exactly 488 rows because of its hard-coded ``np.arange(1, 488)``
    reindex.
    """
    # First row (and rows whose close is NaN) contribute 0, matching the
    # original's fillna(0) before the threshold test.
    daily_ret = dfs.close.pct_change().fillna(0)
    return list(dfs.index[daily_ret <= -0.11])
# Ex-rights day indices per stock, parallel to code_df.
cq_list = [get_cq_day(df) for df in code_df]
# Number of stocks with no detected ex-rights event (sanity statistic).
no_cqs = sum(1 for c in cq_list if c == [])
def c_fq_1(d):
global code_df
global cq_list
a = code_df[d].copy()
a.close[cq_list[d][0]:] = a.close[cq_list[d][0]:] * a.close[cq_list[d][0]-1] / a.close[cq_list[d][0]]
return a
def c_fq_2(d):
global code_df
global cq_list
a = code_df[d].copy()
a.close[cq_list[d][0]:] = a.close[cq_list[d][0]:] * a.close[cq_list[d][0]-1] / a.close[cq_list[d][0]]
a.close[cq_list[d][1]:] = a.close[cq_list[d][1]:] * a.close[cq_list[d][1]-1] / a.close[cq_list[d][1]]
return a
# Apply the re-adjustment per stock; 0, 1 or 2 detected ex-rights events
# are handled (this data set never produced more than 2).
new_df_list = []
for i in range(len(code_df)):
    if len(cq_list[i]) == 0:
        new_df_list.append(code_df[i])
    elif len(cq_list[i]) == 1:
        new_df_list.append(c_fq_1(i))
    elif len(cq_list[i]) == 2:
        new_df_list.append(c_fq_2(i))
    print(i)
# One linear concat of the whole list instead of growing a frame in a loop.
new_all_df = pd.concat(new_df_list)
# Back-fill the leading NaNs of stocks that started trading after day 1.
new_all_df = new_all_df.bfill()
new_all_df.to_csv("new_data2.csv", index=False)
脚本4获取训练样本,这步是根据清洗好的数据构造训练样本,生成训练数据.csv
import pandas as pd
import numpy as np

data = pd.read_csv("new_data2.csv")
code_list = list(pd.unique(data.code))


def get_clear_df(d):
    """Build the labelled sample for stock ``d`` (index into code_list).

    The target y is the 120-day forward return:
        y[i] = (close[i+120] - close[i]) / close[i]
    The last 120 rows have no forward close, become NaN and are dropped,
    leaving 368 samples per stock (488 - 120).
    """
    global data
    global code_list
    a = data[data.code == code_list[d]].reset_index(drop=True)
    a["y"] = (a.close[120:].reset_index(drop=True) - a.close[0:368]) / a.close[0:368]
    return a.dropna()


# BUG FIX: the original ran this accumulation loop twice back to back;
# the first pass was dead work because clear_df was reset to an empty
# frame before the second, identical pass.  One pass yields the same file.
clear_df = pd.concat([get_clear_df(i) for i in range(len(code_list))])
clear_df.to_csv("训练数据.csv", index=False)
脚本5分析因子与收益,筛选特征
# Script 5: evaluate each factor against the forward return y and shortlist features.
import pandas as pd
data = pd.read_csv("训练数据.csv")
col_list = list(data.columns)
data1 = data[data.days == 1]
# Below: ways to measure the relationship between each feature and the return.
import matplotlib.pyplot as plt
# 1. Correlation coefficients.
# pandas' default corr() is Pearson; three methods exist: pearson, spearman, kendall.
# The triple-quoted block below is an earlier experiment (per-day Kendall
# correlation table) kept commented out for reference.
'''
##############################################################
def get_xgxs(n,m):
global col_list
global data
data1 = data[data.days == m]
a = data1[col_list[n]]
b = data1.y
#return a.corr(b) #在这里设置像关系数方法
return a.corr(b,method = "kendall")
days_list = list(pd.unique(data.days))
#相关系数表
corr_df = pd.DataFrame()
for i in range(1,len(days_list)+1):
corr_list = []
for j in range(2,89):
corr_list.append(get_xgxs(j,i))
col_name = "days_" + str(i)
corr_df[col_name] = corr_list
print(i)
corr_df.index=col_list[2:89]
#corr_df.to_csv("日相关系数表_Pearson.csv")
#设计一个方案,研究系数的变异和分布情况,希望找的特征的相关性尽可能稳定
corr_df2 = corr_df.T
c_mean = list(corr_df2.mean())
c_std = list(corr_df2.std())
c_max = list(corr_df2.max())
c_min = list(corr_df2.min())
c_len = list(corr_df2.max() - corr_df2.min())
corr_df["mean"] = c_mean
corr_df["std"] = c_std
corr_df["max"] = c_max
corr_df["min"] = c_min
corr_df["len"] = c_len
corr_df.to_csv("日相关系数表_kendall.csv")
#################################################################
'''
# 2. Correlation against a boolean (top-50 by return) label.
def get_xgxs(n, m):
    """Correlation of feature column ``n`` with the top-50 label on day ``m``.

    Flags the 50 rows with the highest y as 1 (column ``y_b``) and the
    rest as 0, then returns the Spearman correlation between the feature
    and ``y_b``.

    BUG FIX: the original correlated against the raw return ``y`` even
    though it had just built the boolean label, so ``y_b`` was dead code
    and the "top50" output file name was misleading.  It also wrote the
    label through a chained slice assignment (data1[0:50]["y_b"] = 1),
    which pandas does not guarantee to apply; .loc is used instead.
    """
    global col_list
    global data
    data1 = data[data.days == m].sort_values("y", ascending=False).reset_index(drop=True)
    data1["y_b"] = 0
    data1.loc[:49, "y_b"] = 1  # top 50 rows by y (labels 0..49 inclusive)
    a = data1[col_list[n]]
    b = data1.y_b
    # Spearman here; swap method for pearson/kendall experiments.
    return a.corr(b, method="spearman")
days_list = list(pd.unique(data.days))
# Correlation table: one column per day, one row per feature (columns 2..88).
corr_df = pd.DataFrame()
for day in range(1, len(days_list) + 1):
    corr_df["days_" + str(day)] = [get_xgxs(feat, day) for feat in range(2, 89)]
    print(day)
corr_df.index = col_list[2:89]
# corr_df.to_csv("日相关系数表_Pearson.csv")
# Summarise how stable each feature's correlation is across days — a
# useful feature should correlate with the label consistently (low
# dispersion, narrow min/max span).
corr_df2 = corr_df.T
c_mean = list(corr_df2.mean())
c_std = list(corr_df2.std())
c_max = list(corr_df2.max())
c_min = list(corr_df2.min())
c_len = list(corr_df2.max() - corr_df2.min())
corr_df["mean"] = c_mean
corr_df["std"] = c_std
corr_df["max"] = c_max
corr_df["min"] = c_min
corr_df["len"] = c_len
corr_df.to_csv("日相关系数表_top50_spearman.csv")
# Script 6: build the model.
import pandas as pd
import numpy as np

data = pd.read_csv("训练数据.csv")
col_list = list(data.columns)
# Clamp infinities so the sklearn fitters don't choke:
#   +inf -> largest finite value of the column, -inf -> 0.
# FIX: the original used chained-indexing assignment
# (data[i][mask] = v), which pandas may silently drop; .loc writes
# directly into the frame.
for col in col_list:
    finite_max = max(data[col][data[col] != np.inf])
    data.loc[data[col] == np.inf, col] = finite_max
    data.loc[data[col] == -np.inf, col] = 0
data = data.fillna(0)
# Feature selection (earlier experiment, kept for reference):
# data = data.ix[:,[58,59,60,61,89,90]]
from sklearn import preprocessing
# data = preprocessing.minmax_scale(data, feature_range=(0, 1))
from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors
from sklearn import svm
from sklearn import linear_model
from sklearn import neural_network
def get_tree(n):
    """Fit one regressor on day ``n``'s cross-section and return it.

    Features are the positional columns 2..88, the target is the last
    column (y).  A random forest is the active choice; the commented
    alternatives were tried during experimentation.
    """
    data1 = data[data.days == n]
    # t = tree.DecisionTreeRegressor(max_depth=2)  # try depth 4 next
    t = ensemble.RandomForestRegressor(n_estimators=50,
                                       max_features="auto",
                                       max_depth=None,
                                       min_samples_leaf=50,
                                       bootstrap=True)
    # t = neighbors.KNeighborsRegressor(n_neighbors=20)
    # t = svm.SVR()
    # t = linear_model.LinearRegression()
    # t = linear_model.Lasso()
    # t = neural_network.MLPRegressor(hidden_layer_sizes=(5, 5, 5), max_iter=50)
    # FIX: .iloc replaces .ix, which was removed in pandas 1.0; the
    # selection here is purely positional, so the result is identical.
    x = data1.iloc[:, 2:89]
    y = data1.iloc[:, -1]
    # x = preprocessing.minmax_scale(x, feature_range=(0, 1))
    t.fit(x, y)
    return t
# One model per training day.
trees_list = []
days = list(pd.unique(data.days))
for d in days:
    trees_list.append(get_tree(d))
    print(d)

# Score the prediction day (488.csv) with every daily model and rank
# stocks by the summed prediction.
p_data = pd.read_csv("488.csv")
x = p_data.iloc[:, 2:]  # FIX: .iloc replaces the removed .ix (positional slice)
# x = p_data.iloc[:,[58,59,60,61]]
points = 0
for model in trees_list:
    points = points + model.predict(x)
p_data["p"] = points
out_data = p_data[["code", "p"]]
out_data = out_data.sort_values("p", ascending=False).reset_index(drop=True)
out_data = out_data[0:20]  # keep the top-20 stocks
del out_data["p"]
out_data.to_csv("answer08.csv", index=False)