凤凰金融 preliminary round baseline: rank 4 on the leaderboard, 42+ points

luwenlong321 | Data Scientist | Machine Learning

The whole pipeline is split into several scripts, most of which are devoted to cleaning the data.

Script 1: collect the data. Read in each daily file, sort by day, and merge everything into one summary file, all_data.csv.

import pandas as pd
import os


file_path = r"E:\DataCastle\1.凤凰金融\round4\round4"
file_list = os.listdir(file_path)

data_list = []

# read each day's file; the filename (e.g. "1.csv") encodes the day number
for i in range(len(file_list)):
    datas = pd.read_csv(os.path.join(file_path, file_list[i]))
    datas["days"] = int(file_list[i].split(".")[0])
    data_list.append(datas)

# grab the column names
col_list = list(data_list[0].columns)

'''
# collect all stock codes
code_list = []
for i in range(len(data_list)):
    code_list = list(set(list(data_list[i].code) + code_list))
'''

# stack all days into one DataFrame and write the summary file
df = pd.concat(data_list, ignore_index=True)
df.to_csv("all_data.csv", index=False)
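
A quick sanity check on the merged file (my addition, not part of the original pipeline; the day range 1..488 is an assumption read off Script 3's np.arange(1, 489)):

import pandas as pd

data = pd.read_csv("all_data.csv")

# the merged file should span days 1..488 (assumed from Script 3)
print(data["days"].min(), data["days"].max())
print(data["days"].nunique(), "distinct days")
print(data["code"].nunique(), "distinct stock codes")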

Script 2: exploration. This step is straightforward: draw a few plots to eyeball how the factors relate to price.

import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv("all_data.csv")

col_list = list(data.columns)


######################################
# number of distinct values per feature
'''
f_times = []
for i in range(len(col_list)):
    f_times.append(len(pd.unique(data[col_list[i]])))

f_df = pd.DataFrame()
f_df["col_name"] = col_list
f_df["f_times"] = f_times
'''

code_list = list(pd.unique(data.code))

def get_df(code):
    """Return one stock's rows, sorted by day."""
    global data
    return data[data["code"] == code].sort_values("days").reset_index(drop=True)

df1 = get_df(code_list[2])

# overlay the close price and one factor over time
plt.plot(df1.close)
plt.plot(df1.f29)
plt.show()

# then look at them against each other
plt.scatter(df1.close, df1.f29)
plt.show()
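
Eyeballing factors one at a time gets slow with roughly 90 columns. A bulk alternative (my sketch, not in the original script) ranks every factor by its correlation with the close price in one shot:

import pandas as pd

data = pd.read_csv("all_data.csv")

# correlation of every numeric factor with the close price, strongest first
num = data.select_dtypes("number")
corrs = num.corr()["close"].drop("close")
print(corrs.abs().sort_values(ascending=False).head(20))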

Script 3: clean the data. This step mainly handles ex-rights adjustments and trading halts. For ex-rights detection I use a simple rule: a one-day drop of 11% or more is treated as an ex-rights event. The output is new_data2.csv.

import pandas as pd
import numpy as np

data = pd.read_csv("all_data.csv")

code_list = list(pd.unique(data.code))  # list of all stock codes

def get_df(code):
    """One stock's rows, reindexed onto the full day range 1..488 so that
    halted days exist as rows, then forward-filled across the gaps."""
    global data
    b = pd.DataFrame(np.arange(1, 489), columns=["days"])
    a = data[data["code"] == code].sort_values("days").reset_index(drop=True)
    return pd.merge(a, b, how="outer").sort_values("days").reset_index(drop=True).ffill()

code_df = []  # one DataFrame per stock

for i in range(len(code_list)):
    code_df.append(get_df(code_list[i]))
    print(i)


def get_cq_day(dfs):
    """Row indices where the close drops 11% or more day-over-day;
    those days are treated as ex-rights events."""
    a = dfs.copy()
    # day-over-day return: close[t] / close[t-1] - 1
    a["diff_p"] = a.close.pct_change().fillna(0)
    return list(a.index[a.diff_p <= -0.11].values)


cq_list = []  # detected ex-rights dates per stock
for i in range(len(code_df)):
    cq_list.append(get_cq_day(code_df[i]))


no_cqs = 0  # count the stocks with no ex-rights event
for i in range(len(cq_list)):
    if cq_list[i] == []:
        no_cqs = no_cqs + 1

def c_fq_1(d):
    """Backward-adjust a stock with one ex-rights date: scale everything
    from the event onward by the pre/post price ratio."""
    global code_df
    global cq_list
    a = code_df[d].copy()
    cq = cq_list[d][0]
    a.loc[cq:, "close"] = a.loc[cq:, "close"] * a.close[cq - 1] / a.close[cq]
    return a

def c_fq_2(d):
    """The same adjustment applied sequentially for two ex-rights dates."""
    global code_df
    global cq_list
    a = code_df[d].copy()
    cq1, cq2 = cq_list[d][0], cq_list[d][1]
    a.loc[cq1:, "close"] = a.loc[cq1:, "close"] * a.close[cq1 - 1] / a.close[cq1]
    a.loc[cq2:, "close"] = a.loc[cq2:, "close"] * a.close[cq2 - 1] / a.close[cq2]
    return a

# note: a stock with three or more detected events would be dropped silently;
# a generalized version is sketched after this script
new_df_list = []
for i in range(len(code_df)):
    if len(cq_list[i]) == 0:
        new_df_list.append(code_df[i])
    if len(cq_list[i]) == 1:
        new_df_list.append(c_fq_1(i))
    if len(cq_list[i]) == 2:
        new_df_list.append(c_fq_2(i))
    print(i)

new_all_df = pd.concat(new_df_list)
new_all_df = new_all_df.bfill()
new_all_df.to_csv("new_data2.csv", index=False)
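
The 0/1/2-event branching above silently drops any stock for which detection returns three or more dates. A generalized sketch (my addition, reusing code_df and cq_list from the script above) that handles any number of events:

def c_fq_n(d):
    """Backward-adjust stock d for however many ex-rights dates were detected."""
    a = code_df[d].copy()
    for cq in cq_list[d]:
        # scale the post-event segment so the series is continuous across the event
        a.loc[cq:, "close"] = a.loc[cq:, "close"] * a.close[cq - 1] / a.close[cq]
    return a

new_df_list = [c_fq_n(i) for i in range(len(code_df))]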

Script 4: build the training samples. Construct training samples from the cleaned data; the label is each stock's return over the following 120 days. Output: 训练数据.csv.

import pandas as pd

data = pd.read_csv("new_data2.csv")

code_list = list(pd.unique(data.code))

def get_clear_df(d):
    """Attach the label: y[t] is the forward 120-day return,
    (close[t+120] - close[t]) / close[t]. 368 = 488 - 120, so the last
    120 rows of each stock have no label and are dropped by dropna()."""
    global data
    global code_list
    a = data[data.code == code_list[d]].reset_index(drop=True)
    a["y"] = (a.close[120:].reset_index(drop=True) - a.close[0:368]) / a.close[0:368]
    return a.dropna()


clear_df = pd.DataFrame()
for i in range(len(code_list)):
    b = get_clear_df(i)
    clear_df = pd.concat((clear_df, b))


clear_df.to_csv("训练数据.csv",index=False)
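
For reference, the same label can be written with shift, which avoids hard-coding 368 = 488 - 120 (my rewrite, behaviorally equivalent to get_clear_df above; the name get_clear_df2 is hypothetical):

def get_clear_df2(d):
    """Forward 120-day return via shift(-120); equivalent to get_clear_df."""
    a = data[data.code == code_list[d]].reset_index(drop=True)
    a["y"] = a.close.shift(-120) / a.close - 1  # close[t+120] / close[t] - 1
    return a.dropna()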

Script 5: analyze how each factor relates to returns, and screen features.

import pandas as pd

data = pd.read_csv("训练数据.csv")
col_list = list(data.columns)

data1 = data[data.days == 1]   # quick look at a single day

# the question now is how to score each feature against the return label


# 1. correlation coefficients
# pandas' .corr() defaults to Pearson; three methods are available:
# pearson, spearman, kendall
'''
##############################################################
def get_xgxs(n, m):
    """Correlation between feature column n and the label y on day m."""
    global col_list
    global data
    data1 = data[data.days == m]
    a = data1[col_list[n]]
    b = data1.y
    #return a.corr(b)                    # choose the correlation method here
    return a.corr(b, method="kendall")

days_list = list(pd.unique(data.days))

# table of per-day correlations
corr_df = pd.DataFrame()

for i in range(1, len(days_list) + 1):
    corr_list = []
    for j in range(2, 89):
        corr_list.append(get_xgxs(j, i))
    col_name = "days_" + str(i)
    corr_df[col_name] = corr_list
    print(i)

corr_df.index = col_list[2:89]
#corr_df.to_csv("日相关系数表_Pearson.csv")

# study the spread of each feature's coefficients across days;
# the goal is features whose correlation with the label stays stable
corr_df2 = corr_df.T

c_mean = list(corr_df2.mean())
c_std = list(corr_df2.std())
c_max = list(corr_df2.max())
c_min = list(corr_df2.min())
c_len = list(corr_df2.max() - corr_df2.min())

corr_df["mean"] = c_mean
corr_df["std"] = c_std
corr_df["max"] = c_max
corr_df["min"] = c_min
corr_df["len"] = c_len
corr_df.to_csv("日相关系数表_kendall.csv")
#################################################################
'''
# 2. correlation with a boolean label (top 50 returns -> 1, rest -> 0)

def get_xgxs(n, m):
    """Correlation between feature column n and the boolean top-50 label on day m."""
    global col_list
    global data
    data2 = data[data.days == m]
    data1 = data2.copy()
    data1 = data1.sort_values("y", ascending=False).reset_index(drop=True)
    data1["y_b"] = 0
    data1.loc[0:49, "y_b"] = 1          # .loc is label-inclusive: rows 0..49 = top 50
    a = data1[col_list[n]]
    b = data1.y_b                       # the original correlated against y here, leaving y_b unused
    #return a.corr(b)                   # choose the correlation method here
    return a.corr(b, method="spearman")

days_list = list(pd.unique(data.days))

# table of per-day correlations
corr_df = pd.DataFrame()

for i in range(1, len(days_list) + 1):
    corr_list = []
    for j in range(2, 89):
        corr_list.append(get_xgxs(j, i))
    col_name = "days_" + str(i)
    corr_df[col_name] = corr_list
    print(i)

corr_df.index = col_list[2:89]

# same stability summary as in method 1
corr_df2 = corr_df.T

c_mean = list(corr_df2.mean())
c_std = list(corr_df2.std())
c_max = list(corr_df2.max())
c_min = list(corr_df2.min())
c_len = list(corr_df2.max() - corr_df2.min())

corr_df["mean"] = c_mean
corr_df["std"] = c_std
corr_df["max"] = c_max
corr_df["min"] = c_min
corr_df["len"] = c_len
corr_df.to_csv("日相关系数表_top50_spearman.csv")
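
The script stops at writing the coefficient tables; the screening rule itself isn't shown. One plausible rule (my sketch; the 0.05 and 2.0 thresholds are arbitrary assumptions, not the author's) keeps features whose mean correlation is large relative to its day-to-day spread:

import pandas as pd

corr_df = pd.read_csv("日相关系数表_top50_spearman.csv", index_col=0)

# keep features with a sizeable mean correlation that stays stable across days
stable = corr_df[(corr_df["mean"].abs() > 0.05) &
                 (corr_df["mean"].abs() > 2.0 * corr_df["std"])]
print(stable.index.tolist())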
Script 6: build the model. Train one regressor per trading day, sum each daily model's prediction on the final day's features, and submit the top 20 stocks.

import pandas as pd
import numpy as np

data = pd.read_csv("训练数据.csv")
col_list = list(data.columns)

# replace +inf with each column's largest finite value, and -inf with 0
for i in col_list:
    finite_max = data.loc[data[i] != np.inf, i].max()
    data.loc[data[i] == np.inf, i] = finite_max
    data.loc[data[i] == -np.inf, i] = 0

data = data.fillna(0)

# feature selection
#data = data.iloc[:, [58, 59, 60, 61, 89, 90]]

from sklearn import preprocessing
#data = preprocessing.minmax_scale(data, feature_range=(0, 1))


from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors
from sklearn import svm
from sklearn import linear_model
from sklearn import neural_network

def get_tree(n):
    """Fit one regressor on day n's cross-section of stocks."""
    data1 = data[data.days == n]
#    t = tree.DecisionTreeRegressor(max_depth=2)   # try depth 4 tomorrow

    t = ensemble.RandomForestRegressor(n_estimators=50,
                                       max_features=1.0,  # "auto" in the original; 1.0 is the modern equivalent for regressors
                                       max_depth=None,
                                       min_samples_leaf=50,
                                       bootstrap=True)

    #t = neighbors.KNeighborsRegressor(n_neighbors=20)
#    t = svm.SVR()
#    t = linear_model.LinearRegression()
#    t = linear_model.Lasso()
    #t = neural_network.MLPRegressor(hidden_layer_sizes=(5, 5, 5),
#                                    max_iter=50)
    x = data1.iloc[:, 2:89]     # the 87 factor columns
#    x = data1.iloc[:, 0:-2]
    y = data1.iloc[:, -1]       # the label y
#    x = preprocessing.minmax_scale(x, feature_range=(0, 1))
    t.fit(x, y)
    return t

# one model per training day
trees_list = []
days = list(pd.unique(data.days))
for i in days:
    trees_list.append(get_tree(i))
    print(i)


p_data = pd.read_csv("488.csv")    # features on the final day
x = p_data.iloc[:, 2:]
#x = p_data.iloc[:, [58, 59, 60, 61]]

# sum every daily model's prediction into a final score
points = 0
#qz = 1   # weight experiment
for i in trees_list:
#    qz = qz + 0.01
    points = points + i.predict(x)

p_data["p"] = points

# rank by score and submit the top 20 stock codes
out_data = p_data[["code", "p"]]
out_data = out_data.sort_values("p", ascending=False).reset_index(drop=True)
out_data = out_data[0:20]
del out_data["p"]
out_data.to_csv("answer08.csv", index=False)
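
One fragile spot in Script 6: training slices features positionally with iloc[:, 2:89], while prediction uses iloc[:, 2:], so the two matrices only line up if 488.csv has exactly the expected column layout. A safer sketch (my addition, assuming 488.csv shares column names with the training data) selects the same named columns in both places:

# the 87 factor names used for training, taken from the training frame
feature_cols = col_list[2:89]

# inside get_tree:      x = data1[feature_cols]
# for the final day:
x = p_data[feature_cols]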

