竞赛圈 > 开源数据统计和baseline 1.45229分
开源一份蛋白质亲和力预测的代码,
包括数据的初步统计和一个简单的baseline。
1.45229分,目前可以排22名
好久没来DC了
#encoding=utf-8
import pandas as pd
import numpy as np
import sys
# --- Load the raw tables and print a quick summary of each ------------------
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.

# Training proteins, plus the length of each sequence string.
# .str.len() is the vectorised form of apply(len); it yields NaN for a missing
# value instead of raising TypeError.
df_protein_train = pd.read_csv('./data/df_protein_train.csv')
df_protein_train['seq_len'] = df_protein_train['Sequence'].str.len()
print("protein_train")
print(df_protein_train.head())
print(df_protein_train.describe())

# Test proteins, summarised the same way as the training proteins.
df_protein_test = pd.read_csv('./data/df_protein_test.csv')
df_protein_test['seq_len'] = df_protein_test['Sequence'].str.len()
print("protein_test")
print(df_protein_test.head())
print(df_protein_test.describe())

# Molecules, plus the length of each fingerprint string.
df_molecule = pd.read_csv('./data/df_molecule.csv')
df_molecule['Fingerprint_len'] = df_molecule['Fingerprint'].str.len()
print("molecure")
print(df_molecule.head())
print(df_molecule.describe())

# Training affinity table (carries the Ki column averaged further down).
df_affinity_train = pd.read_csv('./data/df_affinity_train.csv')
print("affinity")
print(df_affinity_train.head())
print(df_affinity_train.describe())

# Rows to be predicted; only the describe() summary is printed for this one.
df_affinity_test = pd.read_csv('./data/df_affinity_test_toBePredicted.csv')
print("affnity toBePredicted")
print(df_affinity_test.describe())
# --- Baseline statistic: mean Ki of every molecule in the training table ----
# The original .agg({'Ki_avg': 'mean'}) relied on dict-based renaming for a
# single-column groupby, which newer pandas rejects with SpecificationError.
# Taking the mean and renaming the column produces the same two-column frame
# (Molecule_ID, Ki_avg) on old and new pandas alike.
df_molecule_avg = (
    df_affinity_train.groupby(['Molecule_ID'], as_index=False)['Ki']
    .mean()
    .rename(columns={'Ki': 'Ki_avg'})
)
print("affinity group by Molecule_ID")
print(df_molecule_avg.describe())

# --- Exploratory joins of the training affinities with the protein tables ---
# Left joins keep every affinity row; the describe() output shows how many
# rows actually picked up protein columns from each table.
df1 = pd.merge(df_affinity_train, df_protein_train, on=["Protein_ID"], how='left')
print("merge df_affinity_train,df_protein_train")
print(df1.describe())

df1 = pd.merge(df_affinity_train, df_protein_test, on=["Protein_ID"], how='left')
print("merge df_affinity_train,df_protein_test")
print(df1.describe())
# --- Build the submission: predict each row with its molecule's mean Ki -----
df1 = pd.merge(df_affinity_test, df_molecule_avg, on=["Molecule_ID"], how='left')

# Rows whose molecule has no entry in df_molecule_avg come out of the left
# join as NaN; fall back to the mean of the predictions that did match.
# The mean is taken from the Ki_avg column directly: DataFrame.mean() over
# every column fails on recent pandas when any column is non-numeric, and a
# frame-wide fillna would also write the Ki fallback into unrelated columns.
fallback_ki = df1['Ki_avg'].mean()
df1['Ki_avg'] = df1['Ki_avg'].fillna(fallback_ki)

# Positional rename to the submission header.
# NOTE(review): assumes the merged frame has exactly three columns in the
# order (protein id, molecule id, prediction) - confirm against the test CSV.
df1.columns = ['Protein_ID', 'Molecule_ID', 'Ki']
df1.to_csv("result.csv", index=False)