Tabular data
! [ -e /content ] && pip install -Uqq fastbook kaggle waterfallcharts treeinterpreter dtreeviz
import fastbook
fastbook.setup_book()
from fastbook import *
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG
pd.options.display.max_rows = 20
pd.options.display.max_columns = 8
path = Path('/root/.fastai/archive/bluebook-for-bulldozers')
Path.BASE_PATH = path
path.ls(file_type='text')
df = pd.read_csv(path/'TrainAndValid.csv', low_memory=False)
df.columns
str(df.ProductSize.unique()[1:])
sizes = 'Large', 'Large / Medium', 'Medium', 'Small', 'Mini', 'Compact'
df.ProductSize = df.ProductSize.astype('category')
df.ProductSize.cat.set_categories(sizes, ordered=True, inplace=True)
df.ProductSize
dep_var = 'SalePrice'
df[dep_var] = np.log(df[dep_var])
len(df.columns)
df = add_datepart(df, 'saledate')
len(df.columns)
' '.join(o for o in df.columns if o.startswith('sale'))
procs = [Categorify, FillMissing]
cond = (df.saleYear<2011) | (df.saleMonth<10)
train_idx = np.where(cond)[0]
valid_idx = np.where(~cond)[0]
splits = (train_idx.tolist(), valid_idx.tolist())
#splits
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)
to.classes["ProductSize"]
save_pickle(path/'to.pkl', to)
to = load_pickle(path/'to.pkl')
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y
m = DecisionTreeRegressor(max_leaf_nodes=4)
m.fit(xs, y)
draw_tree(m, xs, size=10, leaves_parallel=True, precision=2)
xs.loc[xs.YearMade<1900, 'YearMade'] = 1950
valid_xs.loc[valid_xs.YearMade<1900, 'YearMade'] = 1950
def r_mse(pred,y): return round(math.sqrt(((pred-y)**2).mean()), 6)
def m_rmse(m, xs, y): return r_mse(m.predict(xs), y)
m = DecisionTreeRegressor(min_samples_leaf=25).fit(to.train.xs, to.train.y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y), m.get_n_leaves()
def rf(xs, y, n_estimators=40, max_samples=200_000,
max_features=0.5, min_samples_leaf=5, **kwargs):
m = RandomForestRegressor(n_jobs=-1, n_estimators=n_estimators,
max_samples=max_samples, max_features=max_features,
min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)
return m
m = rf(xs, y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)
def rf_feat_importance(m, df):
df_ret = pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
).sort_values('imp', ascending=False)
return df_ret
fi = rf_feat_importance(m, xs)
fi[:10]
def plot_fi(fi):
return fi.plot('cols', 'imp', 'barh', figsize=(12, 7), legend=False)
plot_fi(fi[:30])
to_keep = fi[fi.imp>0.005].cols
len(to_keep)
xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]
m = rf(xs_imp, y)
m_rmse(m, xs_imp, y), m_rmse(m, valid_xs_imp, valid_y),
plot_fi(rf_feat_importance(m, xs_imp))
cluster_columns(xs_imp)
def get_oob(df):
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=15,
max_samples=50000, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(df, y)
return m.oob_score_
get_oob(xs_imp)
{c:get_oob(xs_imp.drop(c, axis=1)) for c in (
'saleYear', 'saleElapsed')}#, 'ProductGroupDesc','ProductGroup',
# 'fiModelDesc', 'fiBaseModel',
# 'Hydraulics_Flow','Grouser_Tracks', 'Coupler_System')}
to_drop = ['saleYear', 'ProductGroupDesc', 'fiBaseModel', 'Grouser_Tracks']
get_oob(xs_imp.drop(to_drop, axis=1))
xs_final = xs_imp.drop(to_drop, axis=1)
valid_xs_final = valid_xs_imp.drop(to_drop, axis=1)
m = rf(xs_final, y)
m_rmse(m, xs_final, y), m_rmse(m, valid_xs_final, valid_y)
p = valid_xs_final['ProductSize'].value_counts(sort=False).plot.barh()
c = to.classes['ProductSize']
plt.yticks(range(len(c)), c);
ax = valid_xs_final['YearMade'].hist()
from sklearn.inspection import plot_partial_dependence
fig,ax = plt.subplots(figsize=(12, 4))
plot_partial_dependence(m, valid_xs_final, ['YearMade','ProductSize'],
grid_resolution=20, ax=ax);
!pip install -Uqq --user treeinterpreter
!pip install -Uqq --user waterfallcharts
import warnings
warnings.simplefilter('ignore', FutureWarning)
from treeinterpreter import treeinterpreter
from waterfall_chart import plot as waterfall
row = valid_xs_final.iloc[:5]
#row.values.shape
prediction, bias, contributions = treeinterpreter.predict(m, row.values)
prediction[0], bias[0], contributions[0].sum()
waterfall(valid_xs_final.columns, contributions[0], threshold=0.08,
rotation_value=45,formatting='{:,.3f}');
time_vars = ['SalesID','MachineID']
xs_final_time = xs_final.drop(time_vars, axis=1)
valid_xs_time = valid_xs_final.drop(time_vars, axis=1)
df_nn = pd.read_csv(path/'TrainAndValid.csv', low_memory=False)
df_nn['ProductSize'] = df_nn['ProductSize'].astype('category')
df_nn['ProductSize'].cat.set_categories(sizes, ordered=True, inplace=True)
df_nn[dep_var] = np.log(df_nn[dep_var])
df_nn = add_datepart(df_nn, 'saledate')
df_nn_final = df_nn[list(xs_final_time.columns) + [dep_var]]
df_nn_final.columns
cont_nn,cat_nn = cont_cat_split(df_nn_final, max_card=9000, dep_var=dep_var)
cont_nn
df_nn_final[cat_nn].nunique()
cat_nn.remove('fiModelDescriptor')
procs_nn = [Categorify, FillMissing, Normalize]
to_nn = TabularPandas(df_nn_final, procs_nn, cat_nn, cont_nn,
splits=splits, y_names=dep_var)
dls = to_nn.dataloaders(1024)
y = to_nn.train.y
y.min(),y.max()
learn = tabular_learner(dls, y_range=(8,12), layers=[500,250],
n_out=1, loss_func=F.mse_loss)
learn.fit_one_cycle(5, 1e-2)
preds,targs = learn.get_preds()
r_mse(preds,targs)