Simple LightGBM for win prediction

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import gc, sys
gc.enable()
In [2]:
def reduce_mem_usage(df):
    # memory_usage() reports bytes; convert to megabytes for the printout
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
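
A quick toy check of the downcasting (made-up data, not part of the pipeline): integers are narrowed to the smallest sufficient type, while floats are cast down to float16, which trades some precision for memory.

In [ ]:
# Hypothetical sanity check of reduce_mem_usage
toy = pd.DataFrame({'a': np.arange(1000, dtype=np.int64),
                    'b': np.random.rand(1000)})
toy = reduce_mem_usage(toy)
print(toy.dtypes)  # a -> int16, b -> float16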

PUBG is a team game: players on the same team receive the same number of points (and share the same value of the target variable).
In this game, team skill matters more than individual skill. A team may consist of players of different levels, but if one, two, or more of its members are experienced, the team will still get a high score and, accordingly, a high probability of winning.
Based on these considerations, we group the data by matchId/groupId. This smooths out the individual traits of each player within a team and gives us team-level averages.
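
To illustrate, a minimal sketch with made-up toy data (not part of the pipeline): per-group means hide individual differences within a team.

In [ ]:
# Hypothetical toy data: one duo ('g1') and one solo player ('g2') in match 'm1'
toy = pd.DataFrame({'matchId': ['m1', 'm1', 'm1'],
                    'groupId': ['g1', 'g1', 'g2'],
                    'kills':   [2, 6, 3]})
print(toy.groupby(['matchId', 'groupId'])['kills'].mean())
# ('m1', 'g1') -> 4.0, ('m1', 'g2') -> 3.0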

In [3]:
def stat_features(train, debug):
    
    test_idx = None
    
    if train:
        if debug:
            df = pd.read_csv('../input/train_V2.csv', nrows=10000)
        else:
            df = pd.read_csv('../input/train_V2.csv') 
            print('train dataset shape: ', df.shape)
    else:
        if debug:
            df = pd.read_csv('../input/test_V2.csv', nrows=10000)
            test_idx = df.Id
        else:
            df = pd.read_csv('../input/test_V2.csv') 
            print('test dataset shape: ', df.shape)
            test_idx = df.Id
            
    if train:
        df.drop(df[df['winPlacePerc'].isnull()].index.values, inplace=True)
    
    print("remove some columns")
    target = 'winPlacePerc'
        
    print('get new features')
    df['headshotrate'] = df['kills']/df['headshotKills']  # note: despite the name, this is kills per headshot kill
    df['killStreakrate'] = df['killStreaks']/df['kills']
    df['healthitems'] = df['heals'] + df['boosts']
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['headshotKills_over_kills'] = df['headshotKills'] / df['kills']
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['killsPerWalkDistance'] = df['kills'] / df['walkDistance']
    df["kill_skill"] = df["headshotKills"] + df["roadKills"]
    
    # the ratio features above yield inf/-inf whenever the denominator is zero;
    # replace them (and any remaining NaN) with 0
    df[df == np.Inf] = np.NaN
    df[df == np.NINF] = np.NaN
    df.fillna(0, inplace=True)
    gc.collect()
    
    
    features = list(df.columns)
    features.remove("Id")
    features.remove("matchId")
    features.remove("groupId")
    features.remove("matchType")
    
    y = None
    
    if train: 
        print("get target")
        y = np.array(df.groupby(['matchId','groupId'])[target].agg('mean'), dtype=np.float64)
        features.remove(target)

    print("get group mean feature")
    df_agg = df.groupby(['matchId','groupId'])[features].agg('mean')
    agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    if train: df_out = df_agg.reset_index()[['matchId','groupId']]
    else: df_out = df[['matchId','groupId']]

    df_out = df_out.merge(df_agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])
    
    print("get group max feature")
    df_agg = df.groupby(['matchId','groupId'])[features].agg('max')
    agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(df_agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])
    
    print("get group min feature")
    df_agg = df.groupby(['matchId','groupId'])[features].agg('min')
    agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(df_agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])
    
    print("get group size feature")
    df_agg = df.groupby(['matchId','groupId']).size().reset_index(name='team_size')
    df_out = df_out.merge(df_agg, how='left', on=['matchId', 'groupId'])
    
    print("get match mean feature")
    df_agg = df.groupby(['matchId'])[features].agg('mean').add_suffix('_match_mean').reset_index()
    df_out = df_out.merge(df_agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
    
    print("get match size feature")
    df_agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(df_agg, how='left', on=['matchId'])
    
    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)

    X = df_out

    del df, df_out, df_agg, agg_rank
    gc.collect()

    print('Completed!')
    return X, y, test_idx

Due to resource constraints, we load and process the train and test sets sequentially, deleting datasets as soon as they are no longer needed.

In [4]:
#                       train, debug
X, y, _ = stat_features(True, False)
train dataset shape:  (4446966, 29)
remove some columns
get new features
get target
get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
Completed!

In addition to the standard game modes (solo, duo, squad), the game has a zombie mode, in which 4 "survivors" face an invasion of zombies (ordinary players, but without firearms). Let's flag this mode with a separate feature:

In [ ]:
X['zombie_mode'] = np.where((X['team_size']>4), 1, 0)

Now let's flag dishonest players (cheaters):

In [5]:
X['cheater'] = np.where((X['kills_mean'] > 50) |
                        (X['longestKill_mean'] > 1000) |
                        (X['rideDistance_mean'] > 20000) |
                        (X['swimDistance_mean'] > 2000) |
                        (X['walkDistance_mean'] > 10000) |
                        (X['weaponsAcquired_mean'] > 80) |
                        (X['heals_mean'] > 40), 1, 0)

X[X == np.Inf] = np.NaN
X[X == np.NINF] = np.NaN
X.fillna(0, inplace=True)

gc.collect()

X = reduce_mem_usage(X)
Memory usage of dataframe is 3865.71 MB
Memory usage after optimization is: 908.44 MB
Decreased by 76.5%
In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
gc.collect()
/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/data.py:625: DataConversionWarning: Data with input dtype int8, float16, int16 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
/opt/conda/lib/python3.6/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype int8, float16, int16 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
Out[6]:
14
In [7]:
import os
import time
import gc
import warnings
warnings.filterwarnings("ignore")

The most rigorous approach would be to evaluate the algorithm with cross-validation, but to save time we will measure the error on a hold-out set:
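
For reference, a minimal sketch of what such cross-validation could look like (not run here; the fold count is illustrative and `params` is the dict defined in the training cell below):

In [ ]:
import lightgbm as lgb
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=0)
scores = []
for trn_idx, val_idx in kf.split(X):
    trn = lgb.Dataset(X[trn_idx], label=y[trn_idx])
    val = lgb.Dataset(X[val_idx], label=y[val_idx])
    m = lgb.train(params, trn, valid_sets=[val],
                  early_stopping_rounds=200, verbose_eval=False)
    scores.append(m.best_score['valid_0']['l1'])
print('CV MAE: {:.4f} +/- {:.4f}'.format(np.mean(scores), np.std(scores)))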

In [8]:
indx = int(X.shape[0] * 0.8)
X_train = X[:indx] 
X_hold = X[indx:]
y_train = y[:indx] 
y_hold = y[indx:] 
In [9]:
del X
gc.collect()
Out[9]:
0
In [10]:
import time

Let's try to predict the winner using the LightGBM algorithm.

We use mean absolute error as the metric (one of the competition's requirements); LightGBM reports it as l1 in its training logs. You can read about tuning LightGBM parameters here: https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc
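
As a quick reminder of the metric, MAE is just the mean of the absolute residuals; a toy check with made-up numbers:

In [ ]:
from sklearn.metrics import mean_absolute_error

y_true = np.array([0.2, 0.5, 0.9])
y_pred = np.array([0.25, 0.40, 1.00])
print(mean_absolute_error(y_true, y_pred))  # 0.0833...
print(np.mean(np.abs(y_true - y_pred)))     # the same value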

In [11]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

start = time.time()

params = {"objective" : "regression", "metric" : "mae", 'n_estimators':20000, 'early_stopping_rounds':200,
          "num_leaves" : 31, "learning_rate" : 0.05, "bagging_fraction" : 0.7, "feature_fraction": 0.7,
           "bagging_freq": 1, "bagging_seed" : 0, "num_threads" : 4, "save_binary": "true"
         }

lgtrain = lgb.Dataset(X_train, label=y_train)
lgval = lgb.Dataset(X_hold, label=y_hold)
model = lgb.train(params, lgtrain, valid_sets=[lgtrain, lgval], early_stopping_rounds=200, verbose_eval=1000)

end = time.time()
print("Take Time :",(end-start))
Training until validation scores don't improve for 200 rounds.
[1000]	training's l1: 0.0283482	valid_1's l1: 0.0289277
[2000]	training's l1: 0.027135	valid_1's l1: 0.0282553
[3000]	training's l1: 0.0263165	valid_1's l1: 0.0279418
[4000]	training's l1: 0.0256753	valid_1's l1: 0.0277811
[5000]	training's l1: 0.0251113	valid_1's l1: 0.0276618
[6000]	training's l1: 0.0246012	valid_1's l1: 0.0275821
[7000]	training's l1: 0.0241391	valid_1's l1: 0.0275233
[8000]	training's l1: 0.0237033	valid_1's l1: 0.0274777
[9000]	training's l1: 0.0232924	valid_1's l1: 0.0274346
[10000]	training's l1: 0.0229071	valid_1's l1: 0.0274025
[11000]	training's l1: 0.0225349	valid_1's l1: 0.027372
[12000]	training's l1: 0.0221874	valid_1's l1: 0.0273526
[13000]	training's l1: 0.0218333	valid_1's l1: 0.027327
[14000]	training's l1: 0.0215066	valid_1's l1: 0.0273069
[15000]	training's l1: 0.0211909	valid_1's l1: 0.0272944
[16000]	training's l1: 0.0208842	valid_1's l1: 0.027281
Early stopping, best iteration is:
[16282]	training's l1: 0.0207999	valid_1's l1: 0.0272787
Take Time : 5504.366351366043
In [12]:
del X_train, X_hold, y_train, y_hold
gc.collect()
Out[12]:
34
In [13]:
#                                    train, debug
X_test, _, test_idx = stat_features(False, False)
test dataset shape:  (1934174, 28)
remove some columns
get new features
get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
Completed!
In [14]:
X_test = reduce_mem_usage(X_test)
# use the scaler fitted on the training data; fit_transform here would
# rescale the test set with its own statistics
X_test = scaler.transform(X_test)
Memory usage of dataframe is 3659.63 MB
Memory usage after optimization is: 861.42 MB
Decreased by 76.5%

We predict winPlacePerc using the model's best iteration on the hold-out set:

In [15]:
pred_test = model.predict(X_test, num_iteration=model.best_iteration)
In [16]:
del X_test, test_idx
gc.collect()
Out[16]:
14

Finally, we match our predictions back to the list of players, post-process them, and submit the result for evaluation:
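
One detail worth spelling out: in a match with maxPlace places the true winPlacePerc values lie on a grid with step 1/(maxPlace - 1), so the cell below snaps each prediction to the nearest grid point. A toy example with made-up numbers:

In [ ]:
max_place = 4                      # grid: 0, 1/3, 2/3, 1
gap = 1.0 / (max_place - 1)
raw = np.array([0.10, 0.45, 0.80])
print(np.around(raw / gap) * gap)  # [0. 0.3333 0.6667]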

In [17]:
df_sub = pd.read_csv("../input/sample_submission_V2.csv")
df_test = pd.read_csv("../input/test_V2.csv")
df_sub['winPlacePerc'] = pred_test
# Restore some columns
df_sub = df_sub.merge(df_test[["Id", "matchId", "groupId", "maxPlace", "numGroups"]], on="Id", how="left")

# Sort, rank, and assign adjusted ratio
df_sub_group = df_sub.groupby(["matchId", "groupId"]).first().reset_index()
df_sub_group["rank"] = df_sub_group.groupby(["matchId"])["winPlacePerc"].rank()
df_sub_group = df_sub_group.merge(
    df_sub_group.groupby("matchId")["rank"].max().to_frame("max_rank").reset_index(), 
    on="matchId", how="left")
# note: max_rank is kept for reference only; the ratio below uses numGroups
df_sub_group["adjusted_perc"] = (df_sub_group["rank"] - 1) / (df_sub_group["numGroups"] - 1)

df_sub = df_sub.merge(df_sub_group[["adjusted_perc", "matchId", "groupId"]], on=["matchId", "groupId"], how="left")
df_sub["winPlacePerc"] = df_sub["adjusted_perc"]

# Deal with edge cases
df_sub.loc[df_sub.maxPlace == 0, "winPlacePerc"] = 0
df_sub.loc[df_sub.maxPlace == 1, "winPlacePerc"] = 1

# Align with maxPlace
subset = df_sub.loc[df_sub.maxPlace > 1]
gap = 1.0 / (subset.maxPlace.values - 1)
new_perc = np.around(subset.winPlacePerc.values / gap) * gap
df_sub.loc[df_sub.maxPlace > 1, "winPlacePerc"] = new_perc

# Edge case
df_sub.loc[(df_sub.maxPlace > 1) & (df_sub.numGroups == 1), "winPlacePerc"] = 0
assert df_sub["winPlacePerc"].isnull().sum() == 0

df_sub[["Id", "winPlacePerc"]].to_csv("submission_6.csv", index=False)