baseline (by the WeChat official account 麻婆豆腐AI)

麻婆豆腐

'''
The competition data (anonymized) is customer purchase-behavior records drawn from a fixed
time window; the preliminary and final rounds provide samples at different scales.
Contestants build a model from the feature fields to predict, for every existing customer,
the probability of purchasing again within the next 180 days.
Follow the WeChat official account: 麻婆豆腐AI

The code below is a quick test, run once with 5 features and once with 6 features:
5 features: offline 0.41xx, online 1.07
6 features: offline 0.41xx, online 1.05
The remaining features are left for you to extract.
Follow the WeChat official account 麻婆豆腐AI for possible follow-up updates, and check out
bilibili: https://www.bilibili.com/video/av65398865 , uploader 麻婆豆腐_奏.
The 奏 in that ID is Kanade, as in Tachibana Kanade (立华奏), the heroine of Angel Beats!,
whose favorite food is mapo tofu.
'''
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import datetime
import warnings
warnings.filterwarnings('ignore')
# This probes the organizer's delta: it is likely a value between 2 and 5,
# possibly the natural constant e.
def logloss(y_true, y_pred, delta=3, eps=1e-15):
    # Convert inputs to numpy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    assert (len(y_true) and len(y_true) == len(y_pred))
    # Clip y_pred to [eps, 1-eps] to avoid log(0)
    p = np.clip(y_pred, eps, 1 - eps)
    # Positives are weighted by delta, negatives by 1
    loss = np.sum(-y_true * np.log(p) * delta - (1 - y_true) * np.log(1 - p))
    return loss / len(y_true)
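
# Quick illustration (toy data, not part of the original pipeline): the same predictions
# scored with a few candidate delta values. Comparing offline scores like these against
# leaderboard feedback is one way to reverse-engineer the organizer's delta.
y_demo = np.array([1, 0, 1, 0])
p_demo = np.array([0.9, 0.1, 0.6, 0.4])
for d in (2, np.e, 3, 4, 5):
    print('delta=%.3f loss=%.4f' % (d, logloss(y_demo, p_demo, delta=d)))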

# Read the raw data
train_raw = pd.read_csv('./round1_diac2019_train.csv', low_memory=False)
all_customer = pd.DataFrame(train_raw[['customer_id']]).drop_duplicates(['customer_id']).dropna()
print(all_customer.shape)
print('train date range', train_raw.order_pay_time.max(), train_raw.order_pay_time.min())

'''
The data covers the full year 2013: from 2013-01-01 00:00:18 to 2013-12-31 23:59:44
'''
train_raw['order_pay_time'] = pd.to_datetime(train_raw['order_pay_time'])
train_raw['order_pay_date'] = train_raw['order_pay_time'].dt.date
validate_date_begin = train_raw['order_pay_date'].max() - datetime.timedelta(days=180)

# History window used to build the offline training features
train_history = train_raw[(train_raw['order_pay_date'].astype(str) <= '2013-07-03')]
# Full history used to build the submission features
online_history = train_raw[(train_raw['order_pay_date'].astype(str) <= '2013-12-31')]
# train_label holds the 180 days of future data relative to train_history
train_label = train_raw[train_raw['order_pay_date'].astype(str) >= '2013-07-04']
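
# Optional sanity check: the label window should start right after the history window
# and span roughly 180 days, matching the prediction target.
print('history:', train_history['order_pay_date'].min(), '->', train_history['order_pay_date'].max())
print('label  :', train_label['order_pay_date'].min(), '->', train_label['order_pay_date'].max())
print('label span (days):', (train_label['order_pay_date'].max() - train_label['order_pay_date'].min()).days)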

# Simple feature-generation code
def make_feature_and_label(date1, date2, isSubmit):
    date1 = date1.copy()
    date1['count'] = 1
    # How many orders each customer placed
    customer_id = date1.groupby('customer_id', as_index=False)['count'].agg(count='count')
    # Price statistics of the goods each customer bought
    good_price = date1.groupby('customer_id', as_index=False)['goods_price'].agg(goods_price_max='max',
                                                                                 goods_price_min='min',
                                                                                 goods_price_mean='mean')
    # First and last order dates of each customer
    last_time = date1.groupby('customer_id', as_index=False)['order_pay_date'].agg(order_pay_date_last='max',
                                                                                   order_pay_date_first='min')
    # Of course many more features can be constructed here
    data = pd.merge(customer_id, good_price, on=['customer_id'], how='left', copy=False)
    data = pd.merge(data, last_time, on=['customer_id'], how='left', copy=False)
    # Active span: days between first and last order
    data['long_time'] = pd.to_datetime(data['order_pay_date_last']) - pd.to_datetime(data['order_pay_date_first'])
    data['long_time'] = data['long_time'].dt.days + 1
    del data['order_pay_date_first']
    if not isSubmit:
        # Recency relative to the start of the label window
        data['order_pay_date_last'] = pd.to_datetime(date2['order_pay_date'].min()) - pd.to_datetime(data['order_pay_date_last'])
        data['order_pay_date_last'] = data['order_pay_date_last'].dt.days + 1
        # label = 1 if the customer bought again in the label window
        data['label'] = 0
        data.loc[data['customer_id'].isin(list(date2['customer_id'].unique())), 'label'] = 1
        print(data['label'].mean())
    else:
        # Recency relative to the end of the available data
        data['order_pay_date_last'] = pd.to_datetime('2013-12-31') - pd.to_datetime(data['order_pay_date_last'])
        data['order_pay_date_last'] = data['order_pay_date_last'].dt.days + 1
    print(data.shape)
    return data
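
# A sketch of one more feature family, built only from columns already used above;
# this is illustrative and not one of the author's 5/6 features.
def add_order_gap_feature(raw, data):
    # Average days between orders: active span divided by order count
    tmp = raw.groupby('customer_id')['order_pay_date'].agg(['min', 'max', 'count'])
    gap = (pd.to_datetime(tmp['max']) - pd.to_datetime(tmp['min'])).dt.days / tmp['count']
    return data.merge(gap.rename('avg_order_gap').reset_index(), on='customer_id', how='left')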

# Build the training set and the submission set
train = make_feature_and_label(train_history, train_label, False)
submit = make_feature_and_label(online_history, None, True)
# LightGBM parameters
param = {
    'num_leaves': 128,
    'objective': 'binary',
    'max_depth': -1,
    'learning_rate': 0.1,
    'metric': 'binary_logloss'}
# Assemble the label and feature matrix for training
y = train.pop('label')
feature = [x for x in train.columns if x not in ['customer_id']]
X = train[feature]
# Split into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

submit_df = submit[['customer_id']].copy()
X_submit = submit[feature]
# Wrap the splits as LightGBM datasets
trn_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_valid, label=y_valid)
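
# Hedged sketch (an assumption, not the author's method): if the official metric really
# weights positives by delta as in logloss() above, per-sample weights make LightGBM
# optimize that same weighted loss. Flip the flag to try it.
USE_DELTA_WEIGHTS = False
if USE_DELTA_WEIGHTS:
    guessed_delta = 3.0  # see the comment on logloss(); the true value is unknown
    trn_data = lgb.Dataset(X_train, label=y_train, weight=np.where(y_train == 1, guessed_delta, 1.0))
    val_data = lgb.Dataset(X_valid, label=y_valid, weight=np.where(y_valid == 1, guessed_delta, 1.0))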
# Train with early stopping on the validation set
lgbm = lgb.train(param, trn_data, valid_sets=[trn_data, val_data], num_boost_round=10000,
                 callbacks=[lgb.early_stopping(25), lgb.log_evaluation(50)])
y_submit = lgbm.predict(X_submit)
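
# Offline evaluation with the weighted logloss defined above (delta=3 is a guess);
# presumably scores in this style are what the offline 0.41xx numbers refer to.
val_pred = lgbm.predict(X_valid)
print('offline weighted logloss:', logloss(y_valid, val_pred, delta=3))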

submit_df['result'] = y_submit

all_customer = pd.merge(all_customer,submit_df,on=['customer_id'],how='left',copy=False)
all_customer = all_customer.sort_values(['customer_id'])
all_customer['customer_id'] = all_customer['customer_id'].astype('int64')
all_customer['result'] = all_customer['result'].fillna(0)
all_customer.to_csv('./mpdf_baseline.csv',index=False)
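
# Optional: inspect which of the few features carry the signal before engineering
# more, as the author suggests.
print(dict(zip(feature, lgbm.feature_importance(importance_type='gain'))))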

