竞赛圈   > Seaty-朴素贝叶斯baseline-0.42

OlaOlaOlaOlaOla

算法工程师   |   Python
  • 关注者
  • 关注了 1

OlaOlaOlaOlaOla

算法工程师   |   Python
import pandas as pd
import numpy as np
from math import radians, atan, tan, sin, acos, cos
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import paired_distances
import gc


# Load train/test data, cluster all start & end coordinates into spatial
# "blocks" with DBSCAN, then collect per-block (lat, lon) samples that the
# test set can later be matched against.
train = pd.read_csv('../data/train_new.csv', low_memory=False)
test = pd.read_csv('../data/test_new.csv', low_memory=False)
# The test file may or may not carry the target columns; drop them if present.
# (errors='ignore' replaces the original bare try/except, which also hid
# unrelated failures.)
test.drop(['end_time', 'end_lat', 'end_lon'], axis=1, inplace=True, errors='ignore')
# #############################################################################
n_train = train.shape[0]
# Stack start and end coordinates so a single clustering covers both roles.
X = np.concatenate([train[['start_lat', 'start_lon']].values,
                    train[['end_lat', 'end_lon']].values])
# #############################################################################
# Density clustering (DBSCAN) with Manhattan distance (p=1), eps in degrees.
db = DBSCAN(eps=5e-4, min_samples=2, p=1, leaf_size=10, n_jobs=-1).fit(X)
labels = db.labels_
train['start_block'] = labels[:n_train]
train['end_block'] = labels[n_train:2 * n_train]

start_cols = ['start_block', 'start_lat', 'start_lon']
end_cols = ['end_block', 'end_lat', 'end_lon']
# NOTE: the original wrote `start_rename_cols = columns={...}`, which Python
# parses as a chained assignment and leaks a stray `columns` global — these
# are plain rename mappings.
start_rename_cols = {'start_block': 'block', 'start_lat': 'lat', 'start_lon': 'lon'}
end_rename_cols = {'end_block': 'block', 'end_lat': 'lat', 'end_lon': 'lon'}
block_id_latlon_info = pd.concat([train[start_cols].rename(columns=start_rename_cols),
                                  train[end_cols].rename(columns=end_rename_cols)]).reset_index(drop=True)
# Drop noise points (DBSCAN labels them -1).
block_id_latlon_info = block_id_latlon_info.loc[block_id_latlon_info.block != -1, :]


# Assign a start_block to every test row by nearest-neighbour lookup against
# the clustered train coordinates: cheap per-axis pre-filters, then an L1
# distance threshold. Rows with no close-enough cluster sample get -1.
Y = block_id_latlon_info[['lat', 'lon', 'block']].values
X = test[['start_lat', 'start_lon']].values
print(Y.shape, X.shape)
# Generate block ids for the test set.
thres = 1.5e-3  # max accepted L1 distance, in degrees
min_distance_block_id = np.zeros(X.shape[0], dtype=int)
for idx, x in enumerate(X):
    # Periodic progress report and garbage collection for the long loop.
    if idx and idx % 5000 == 0:
        print(idx)
        gc.collect()

    # Axis-aligned pre-filters shrink the candidate set before the full L1 sum.
    sub_Y = Y[np.abs(Y[:, 0] - x[0]) < thres, :]
    sub_Y = sub_Y[np.abs(sub_Y[:, 1] - x[1]) < thres, :]
    if len(sub_Y):
        distances = np.abs(sub_Y[:, 0] - x[0]) + np.abs(sub_Y[:, 1] - x[1])
        min_distance = distances.min()
        if min_distance < thres:
            # Column 2 of Y holds the cluster (block) id.
            min_distance_block_id[idx] = sub_Y[distances.argmin(), 2]
        else:
            min_distance_block_id[idx] = -1
    else:
        min_distance_block_id[idx] = -1

test['start_block'] = min_distance_block_id
# Keep only train rows whose destination fell inside a real cluster.
good_train_idx = (train.end_block != -1)  # & (train.start_block != -1)
good_train = train.loc[good_train_idx, :]
print('saving new train & test data')
good_train.to_csv('../data/good_train.csv', index=False)
test.to_csv('../data/good_test.csv', index=False)
# There is one extra cluster cell that the author did not know how to remove ^_^
# Generate the is_holiday and hour fields for the train and test sets
def transformer(df):
    """Add time-derived features to *df* in place.

    Derives from the 'start_time' column (strings 'YYYY-MM-DD HH:MM:SS'):
      - hour: 3-hour bucket of the hour (0-7)
      - half: 0/1 for first/second half of the hour
      - day: day of week (Monday=0)
      - is_holiday: 1 if the date is a weekend day that is not a make-up
        workday, or a statutory 2018 CN holiday; else 0
    """
    # Statutory 2018 holidays (New Year, Spring Festival, Qingming, Labour
    # Day, Dragon Boat, Mid-Autumn, National Day).
    # BUGFIX: the original used '%2d' (space-padded), producing strings like
    # '2018-04- 5' / '2018-10- 1' that could never match a real date, so
    # Qingming and National Day were silently never flagged — now '%02d'.
    special_holiday = ['2018-01-01'] + ['2018-02-%d' % d for d in range(15, 22)] + \
                      ['2018-04-%02d' % d for d in range(5, 8)] + \
                      ['2018-04-%d' % d for d in range(29, 31)] + ['2018-05-01'] + \
                      ['2018-06-%d' % d for d in range(16, 19)] + \
                      ['2018-09-%d' % d for d in range(22, 25)] + \
                      ['2018-10-%02d' % d for d in range(1, 8)]
    # Weekend days designated as make-up workdays in 2018.
    special_workday = ['2018-02-%d' % d for d in [11, 24]] + \
                      ['2018-04-08'] + ['2018-04-28'] + \
                      ['2018-09-%d' % d for d in range(29, 31)]
    for t_col in ['start_time']:
        tmp = df[t_col].map(pd.Timestamp)
        df['hour'] = tmp.map(lambda t: t.hour // 3)
        df['half'] = tmp.map(lambda t: t.minute // 30)
        df['day'] = tmp.map(lambda t: t.dayofweek)
        # Compare the raw 'YYYY-MM-DD' prefix against the calendars above.
        tmp_date = df[t_col].map(lambda s: s.split(' ')[0])
        not_spworkday_idx = ~tmp_date.isin(special_workday)
        spholiday_idx = tmp_date.isin(special_holiday)
        weekend_idx = (df['day'] >= 5)
        df['is_holiday'] = ((weekend_idx & not_spworkday_idx) | spholiday_idx).astype(int)

# Reload the cleaned data and derive the time features on both sets.
train = pd.read_csv('../data/good_train.csv', low_memory=False)
test = pd.read_csv('../data/good_test.csv', low_memory=False)
transformer(train)
transformer(test)
# Conditional probabilities P(feature|end_block) needed by the naive-Bayes
# model, estimated from the training set with additive smoothing.
Probability = {}
smooth_y = 10.  # added to the denominator (per-end_block count)
smooth_x = 0.   # added to the numerator (joint count)


def _cond_prob(df, feature_cols, pname):
    """Return a frame ['end_block'] + feature_cols + [pname] holding the
    smoothed conditional probability P(feature_cols | end_block),
    computed as (count(end_block, features) + smooth_x) / (count(end_block) + smooth_y).
    Uses 'r_key' only as a non-null column to count rows."""
    print('calculating %s' % pname)
    dy = df.groupby('end_block', as_index=False)['r_key'].count().rename(columns={'r_key': 'y'})
    dx = df.groupby(['end_block'] + feature_cols, as_index=False)['r_key'].count().rename(columns={'r_key': 'x'})
    dxy = dx.merge(dy, on='end_block', how='left')
    dxy[pname] = (dxy.x + smooth_x) / (dxy.y.astype(float) + smooth_y)
    return dxy[['end_block'] + feature_cols + [pname]]


Probability['P(start_block|end_block)'] = _cond_prob(train, ['start_block'], 'P(start_block|end_block)')
Probability['P(out_id|end_block)'] = _cond_prob(train, ['out_id'], 'P(out_id|end_block)')
Probability['P(is_holiday|end_block)'] = _cond_prob(train, ['is_holiday'], 'P(is_holiday|end_block)')
Probability['P((is_holiday, hour)|end_block)'] = _cond_prob(train, ['is_holiday', 'hour'], 'P((is_holiday, hour)|end_block)')
Probability['P(day|end_block)'] = _cond_prob(train, ['day'], 'P(day|end_block)')
## P(hour|end_block) — unsmoothed relative frequencies within each end_block.
pname = 'P(hour|end_block)'
print('calculating %s' % pname)
tmp = train.groupby('end_block').apply(lambda g: 1.0 * g['hour'].value_counts() / len(g)).reset_index()
tmp.columns = ['end_block', 'hour', pname]
Probability[pname] = tmp
## P((hour,half)|end_block)
## BUGFIX: the original reused a stale `name = 'hour'` here and never grouped
## by 'half' despite the label; fixed to group by both columns. (This entry
## is not consumed by the posterior computation below.)
Probability['P((hour,half)|end_block)'] = _cond_prob(train, ['hour', 'half'], 'P((hour,half)|end_block)')
# Prior: raw end_block frequency counts from the training set.
pname = 'P(end_block)'
print('calculating %s' % pname)
tmp = train.end_block.value_counts().reset_index()
tmp.columns = ['end_block', pname]
Probability[pname] = tmp
## Compute the posterior probability
## P(end_block|(start_block, out_id, is_holiday, hour)) = P(end_block) *
##                         P(start_block|end_block) * P(out_id|end_block) * P((is_holiday, hour)|end_block)
is_local = False  # whether to run local (offline) validation instead of predicting on the test set
if is_local:
    predict_info = train.copy()
    predict_info = predict_info.rename(columns={'end_block': 'true_end_block', 'end_lat': 'true_end_lat', 'end_lon': 'true_end_lon'})
else:
    predict_info = test.copy()
## Merging on 'out_id' alone fans each row out to every candidate end_block
## observed with that out_id in training; the later merges attach the
## remaining per-candidate probabilities. Missing combinations get small
## fallback probabilities via fillna.
predict_info = predict_info.merge(Probability['P(out_id|end_block)'], on='out_id', how='left')
print(predict_info['P(out_id|end_block)'].isnull().sum())
predict_info['P(out_id|end_block)'] = predict_info['P(out_id|end_block)'].fillna(1e-5)
## NOTE(review): P(is_holiday|end_block) and P(day|end_block) are merged below
## but do not enter the final product — presumably kept for experimentation.
predict_info = predict_info.merge(Probability['P(is_holiday|end_block)'], on=['is_holiday', 'end_block'], how='left')
print(predict_info['P(is_holiday|end_block)'].isnull().sum())
predict_info['P(is_holiday|end_block)'] = predict_info['P(is_holiday|end_block)'].fillna(1e-4)
##
predict_info = predict_info.merge(Probability['P(day|end_block)'], on=['day', 'end_block'], how='left')
print(predict_info['P(day|end_block)'].min(), predict_info['P(day|end_block)'].isnull().sum())
predict_info['P(day|end_block)'] = predict_info['P(day|end_block)'].fillna(1e-4)
##
predict_info = predict_info.merge(Probability['P((is_holiday, hour)|end_block)'], on=['is_holiday', 'hour', 'end_block'], how='left')
print(predict_info['P((is_holiday, hour)|end_block)'].isnull().sum())
predict_info['P((is_holiday, hour)|end_block)'] = predict_info['P((is_holiday, hour)|end_block)'].fillna(1e-4)
##
predict_info = predict_info.merge(Probability['P(start_block|end_block)'], on=['start_block', 'end_block'], how='left')
print(predict_info['P(start_block|end_block)'].isnull().sum())
predict_info['P(start_block|end_block)'] = predict_info['P(start_block|end_block)'].fillna(1e-5)
##
predict_info = predict_info.merge(Probability['P(end_block)'], on='end_block', how='left')
print(predict_info['P(end_block)'].isnull().sum())
# NOTE(review): P(end_block) holds raw counts, not normalized probabilities;
# the constant scale factor does not change the per-r_key argmax below.
predict_info['P(end_block)'] = predict_info['P(end_block)'].fillna(1e-1)
predict_info['P(end_block|(start_block, out_id, is_holiday, hour))'] = predict_info['P((is_holiday, hour)|end_block)'] * \
                                                    predict_info['P(out_id|end_block)'] * \
                                                    predict_info['P(start_block|end_block)'] * \
                                                    predict_info['P(end_block)']
which_probability = 'P(end_block|(start_block, out_id, is_holiday, hour))'
# Mean lat/lon of each end_block cluster, used as the predicted coordinates.
block_lat_lon = train.groupby('end_block')[['end_lat', 'end_lon']].mean().reset_index()
predict_info = predict_info.merge(block_lat_lon, on='end_block', how='left')
print(predict_info[['start_lat', 'start_lon', 'end_lat', 'end_lon']].describe())
# For each request r_key, keep the candidate end_block with the highest posterior.
predict_result = predict_info.groupby('r_key').apply(lambda g: g.loc[g[which_probability].idxmax(), :]).reset_index(drop=True)
if not is_local:
    output_result = test[['r_key', 'start_lat', 'start_lon']].merge(predict_result[['r_key', 'end_lat', 'end_lon']], on='r_key', how='left')
    print(output_result.end_lat.isnull().sum())
    # Cold start: temporarily fall back to the start coordinates as the prediction
    nan_idx = output_result.end_lat.isnull()
    output_result.loc[nan_idx, 'end_lat'] = output_result['start_lat'][nan_idx]
    output_result.loc[nan_idx, 'end_lon'] = output_result['start_lon'][nan_idx]
    #output_result[['start_lat', 'end_lat', 'end_lon']].describe()
    print(output_result.head())
    print(output_result.info())
    output_result[['r_key', 'end_lat', 'end_lon']].to_csv('../result/bayes.csv', index=None)


5条评论

分享

5条评论
意见反馈
关注微信公众号 关注微信公众号
  • © 2013-2018 DataCastle 蜀ICP备17028166号-1

扫一扫分享给周围朋友