! [ -e /content ] && pip install -Uqq fastbook kaggle waterfallcharts treeinterpreter dtreeviz
import fastbook
fastbook.setup_book()
from fastbook import *
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG

# Cap how many rows and columns pandas prints when a DataFrame is displayed.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 8
# Dataset directory; it is also installed as Path.BASE_PATH (the listing
# output below shows file names relative to it).
path = Path('/root/.fastai/archive/bluebook-for-bulldozers')
Path.BASE_PATH = path
path.ls(file_type='text')
(#7) [Path('Machine_Appendix.csv'),Path('TrainAndValid.csv'),Path('random_forest_benchmark_test.csv'),Path('Test.csv'),Path('median_benchmark.csv'),Path('ValidSolution.csv'),Path('Valid.csv')]
# Load the combined training/validation table. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv(path/'TrainAndValid.csv', low_memory=False)
df.columns
Index(['SalesID', 'SalePrice', 'MachineID', 'ModelID', 'datasource',
       'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
       'saledate', 'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc',
       'fiModelSeries', 'fiModelDescriptor', 'ProductSize',
       'fiProductClassDesc', 'state', 'ProductGroup', 'ProductGroupDesc',
       'Drive_System', 'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control',
       'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
       'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics',
       'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control', 'Tire_Size',
       'Coupler', 'Coupler_System', 'Grouser_Tracks', 'Hydraulics_Flow',
       'Track_Type', 'Undercarriage_Pad_Width', 'Stick_Length', 'Thumb',
       'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type',
       'Travel_Controls', 'Differential_Type', 'Steering_Controls'],
      dtype='object')
# Show the distinct ProductSize values, skipping the first entry returned by unique().
str(df.ProductSize.unique()[1:])
"['Medium' 'Small' 'Large / Medium' 'Mini' 'Large' 'Compact']"
# ProductSize levels in the order the ordered categorical should use.
sizes = 'Large', 'Large / Medium', 'Medium', 'Small',  'Mini', 'Compact'
df.ProductSize = df.ProductSize.astype('category')
# set_categories returns a new Series; assign it back instead of using the
# deprecated `inplace=True` flag, which emits a FutureWarning and is removed
# in later pandas releases.
df.ProductSize = df.ProductSize.cat.set_categories(sizes, ordered=True)
df.ProductSize
/root/mambaforge/lib/python3.9/site-packages/pandas/core/arrays/categorical.py:2747: FutureWarning: The `inplace` parameter in pandas.Categorical.set_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  res = method(*args, **kwargs)
0            NaN
1         Medium
2            NaN
3          Small
4            NaN
           ...  
412693      Mini
412694      Mini
412695      Mini
412696      Mini
412697      Mini
Name: ProductSize, Length: 412698, dtype: category
Categories (6, object): ['Large' < 'Large / Medium' < 'Medium' < 'Small' < 'Mini' < 'Compact']
# The dependent variable is SalePrice; replace it with its natural log so the
# models below are fitted and scored on log prices.
dep_var = 'SalePrice'
df[dep_var] = np.log(df[dep_var])
len(df.columns)
53
# Expand 'saledate' into derived date-part columns (the sale* columns listed
# further down); the column count goes from 53 to 65.
df = add_datepart(df, 'saledate')
len(df.columns)
65
# Space-separated list of the columns whose names start with 'sale'.
' '.join(o for o in df.columns if o.startswith('sale'))
'saleYear saleMonth saleWeek saleDay saleDayofweek saleDayofyear saleIs_month_end saleIs_month_start saleIs_quarter_end saleIs_quarter_start saleIs_year_end saleIs_year_start saleElapsed'
# Preprocessing steps handed to TabularPandas below.
procs = [Categorify, FillMissing]
# Time-based split: a row is a training row when it was sold before 2011 or in
# a month before October; every other row (October-December of 2011 onward)
# goes to the validation set.
cond = (df.saleYear<2011) | (df.saleMonth<10)
train_idx = np.where(cond)[0]
valid_idx = np.where(~cond)[0]

# Positional row indices for (training, validation), as plain Python lists.
splits = (train_idx.tolist(), valid_idx.tolist())
#splits
# Partition the non-target columns into continuous and categorical name lists.
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)
# Category vocabulary recorded for ProductSize after processing.
to.classes["ProductSize"]
['#na#', 'Large', 'Large / Medium', 'Medium', 'Small', 'Mini', 'Compact']
# Persist the processed TabularPandas object and read it straight back.
save_pickle(path/'to.pkl', to)
to = load_pickle(path/'to.pkl')
# Independent variables and target for the training and validation splits.
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y
# Fit a regression tree limited to 4 leaf nodes and draw it.
m = DecisionTreeRegressor(max_leaf_nodes=4)
m.fit(xs, y)
draw_tree(m, xs, size=10, leaves_parallel=True, precision=2)
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> Tree 0 Coupler_System ≤ 0.5 squared_error = 0.48 samples = 404710 value = 10.1 1 YearMade ≤ 1991.5 squared_error = 0.42 samples = 360847 value = 10.21 0->1 True 2 squared_error = 0.12 samples = 43863 value = 9.21 0->2 False 3 squared_error = 0.37 samples = 155724 value = 9.97 1->3 4 ProductSize ≤ 4.5 squared_error = 0.37 samples = 205123 value = 10.4 1->4 5 squared_error = 0.31 samples = 182403 value = 10.5 4->5 6 squared_error = 0.17 samples = 22720 value = 9.62 4->6
# Overwrite YearMade values below 1900 with 1950 in both the training and
# validation feature frames.
xs.loc[xs.YearMade<1900, 'YearMade'] = 1950
valid_xs.loc[valid_xs.YearMade<1900, 'YearMade'] = 1950
def r_mse(pred,y):
    """Return the root mean squared error between `pred` and `y`, rounded to 6 decimals."""
    mean_sq_err = ((pred-y)**2).mean()
    return round(math.sqrt(mean_sq_err), 6)
def m_rmse(m, xs, y):
    """Return `r_mse` of model `m`'s predictions on `xs` against the targets `y`."""
    preds = m.predict(xs)
    return r_mse(preds, y)
# Fit on the same `xs`/`y` that are scored on the next line (and that every
# other model in this file is fitted on), so any edits made to `xs` after it
# was extracted from `to` are seen during training as well as evaluation.
m = DecisionTreeRegressor(min_samples_leaf=25).fit(xs, y)
# (train RMSE, validation RMSE, number of leaves in the fitted tree)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y), m.get_n_leaves()
(0.248595, 0.323441, 12397)
def rf(xs, y, n_estimators=40, max_samples=200_000,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit a RandomForestRegressor on `xs`, `y` and return the fitted model.

    `n_estimators`, `max_samples`, `max_features` and `min_samples_leaf` are
    passed through to the estimator unchanged. Any extra keyword arguments
    are forwarded to `RandomForestRegressor` as well (previously they were
    accepted but silently ignored). `n_jobs=-1` and `oob_score=True` remain
    the defaults and can be overridden through `kwargs`.
    """
    # setdefault keeps the original hard-coded settings unless the caller
    # supplies their own, and avoids passing the same keyword twice.
    kwargs.setdefault('n_jobs', -1)
    kwargs.setdefault('oob_score', True)
    return RandomForestRegressor(n_estimators=n_estimators,
                                 max_samples=max_samples,
                                 max_features=max_features,
                                 min_samples_leaf=min_samples_leaf,
                                 **kwargs).fit(xs, y)
# Fit a forest with rf()'s default settings and report (train, validation) RMSE.
m = rf(xs, y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)
(0.170917, 0.232626)
def rf_feat_importance(m, df):
    """Pair each column of `df` with `m.feature_importances_`, most important first.

    Returns a DataFrame with columns 'cols' and 'imp', sorted by 'imp' in
    descending order.
    """
    table = pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_})
    return table.sort_values('imp', ascending=False)
# Importance table for the fitted forest; show the ten highest-ranked columns.
fi = rf_feat_importance(m, xs)
fi[:10]
cols imp
57 YearMade 0.173388
6 ProductSize 0.110295
30 Coupler_System 0.100494
7 fiProductClassDesc 0.069326
54 ModelID 0.055765
65 saleElapsed 0.050687
32 Hydraulics_Flow 0.048636
3 fiSecondaryDesc 0.047010
31 Grouser_Tracks 0.044165
1 fiModelDesc 0.031284
def plot_fi(fi):
    """Draw a horizontal bar chart of 'imp' against 'cols' for `fi` and return the plot result."""
    chart_opts = dict(figsize=(12, 7), legend=False)
    return fi.plot('cols', 'imp', 'barh', **chart_opts)
# Plot the 30 highest-ranked columns.
plot_fi(fi[:30])
<AxesSubplot:ylabel='cols'>
# Keep only the columns whose importance exceeds 0.005, and count them.
to_keep = fi[fi.imp>0.005].cols
len(to_keep)
22
# Restrict both splits to the retained columns, refit the forest, and report
# (train, validation) RMSE.
xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]
m = rf(xs_imp, y)
m_rmse(m, xs_imp, y), m_rmse(m, valid_xs_imp, valid_y), 
(0.180775, 0.23147)
# Importance plot for the forest refitted on the reduced column set.
plot_fi(rf_feat_importance(m, xs_imp))
<AxesSubplot:ylabel='cols'>
# NOTE(review): cluster_columns is not defined in this file; presumably it
# plots which columns of xs_imp are similar to each other — confirm.
cluster_columns(xs_imp)
def get_oob(df):
    """Fit a 40-tree random forest on `df` and return its out-of-bag score.

    The target is the module-level `y`, not a parameter, so `df` must have
    one row per element of whatever `y` is bound to when this is called.
    """
    forest = RandomForestRegressor(oob_score=True, n_jobs=-1, n_estimators=40,
                                   max_features=0.5, max_samples=50000,
                                   min_samples_leaf=15)
    return forest.fit(df, y).oob_score_

# Baseline out-of-bag score with every retained column present.
get_oob(xs_imp)
0.8778685460797769
# Out-of-bag score with each listed column dropped on its own, keyed by the
# dropped column. Only 'saleYear' and 'saleElapsed' are evaluated; the other
# candidate names are commented out in the trailing comment lines.
{c:get_oob(xs_imp.drop(c, axis=1)) for c in (
    'saleYear', 'saleElapsed')}#, 'ProductGroupDesc','ProductGroup',
#    'fiModelDesc', 'fiBaseModel',
#    'Hydraulics_Flow','Grouser_Tracks', 'Coupler_System')}
{'saleYear': 0.8761478210481896, 'saleElapsed': 0.872101137022772}
# Out-of-bag score with all four of these columns dropped together.
to_drop = ['saleYear', 'ProductGroupDesc', 'fiBaseModel', 'Grouser_Tracks']
get_oob(xs_imp.drop(to_drop, axis=1))
0.874826620033826
# Final feature frames: the retained columns minus `to_drop`. Refit the forest
# on them and report (train, validation) RMSE.
xs_final = xs_imp.drop(to_drop, axis=1)
valid_xs_final = valid_xs_imp.drop(to_drop, axis=1)
m = rf(xs_final, y)
m_rmse(m, xs_final, y), m_rmse(m, valid_xs_final, valid_y)
(0.182696, 0.233411)
# Horizontal bar chart of how often each ProductSize value occurs in the
# validation set; the y tick labels are replaced with the class names
# recorded in `to.classes`.
p = valid_xs_final['ProductSize'].value_counts(sort=False).plot.barh()
c = to.classes['ProductSize']
plt.yticks(range(len(c)), c);
# Histogram of YearMade in the validation set.
ax = valid_xs_final['YearMade'].hist()
# plot_partial_dependence is deprecated since scikit-learn 1.0 and removed in
# 1.2; PartialDependenceDisplay.from_estimator is the documented replacement
# and takes the same estimator / data / features arguments.
from sklearn.inspection import PartialDependenceDisplay

# Partial dependence of the model's prediction on YearMade and ProductSize,
# evaluated on the validation features and drawn onto the supplied axes.
fig,ax = plt.subplots(figsize=(12, 4))
PartialDependenceDisplay.from_estimator(m, valid_xs_final, ['YearMade','ProductSize'],
                                        grid_resolution=20, ax=ax);
/root/mambaforge/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_partial_dependence is deprecated; Function `plot_partial_dependence` is deprecated in 1.0 and will be removed in 1.2. Use PartialDependenceDisplay.from_estimator instead
  warnings.warn(msg, category=FutureWarning)
!pip install -Uqq --user treeinterpreter
!pip install -Uqq --user waterfallcharts
import warnings
warnings.simplefilter('ignore', FutureWarning)

from treeinterpreter import treeinterpreter
from waterfall_chart import plot as waterfall
# Decompose the forest's predictions for the first five validation rows into a
# bias term plus per-column contributions.
row = valid_xs_final.iloc[:5]
#row.values.shape
prediction, bias, contributions = treeinterpreter.predict(m, row.values)
# First row only: prediction, bias, and the sum of its contributions (in the
# recorded output, bias + contribution sum equals the prediction).
prediction[0], bias[0], contributions[0].sum()
(array([9.95946805]), 10.104300126903656, -0.14483207244735555)
# Waterfall chart of the first row's contributions, labelled by column name.
waterfall(valid_xs_final.columns, contributions[0], threshold=0.08, 
          rotation_value=45,formatting='{:,.3f}');
# Remove the two ID columns named in `time_vars` from both final feature frames.
time_vars = ['SalesID','MachineID']
xs_final_time = xs_final.drop(time_vars, axis=1)
valid_xs_time = valid_xs_final.drop(time_vars, axis=1)
# Reload the raw table and repeat the preparation applied to `df` earlier in
# the file: ordered ProductSize, log target, expanded sale date.
df_nn = pd.read_csv(path/'TrainAndValid.csv', low_memory=False)
df_nn['ProductSize'] = df_nn['ProductSize'].astype('category')
# set_categories returns a new Series; assign it back instead of using the
# deprecated `inplace=True` flag, which is removed in later pandas releases.
df_nn['ProductSize'] = df_nn['ProductSize'].cat.set_categories(sizes, ordered=True)
df_nn[dep_var] = np.log(df_nn[dep_var])
df_nn = add_datepart(df_nn, 'saledate')
# Keep only the columns present in xs_final_time, plus the target.
df_nn_final = df_nn[list(xs_final_time.columns) + [dep_var]]
df_nn_final.columns
Index(['YearMade', 'ProductSize', 'Coupler_System', 'fiProductClassDesc',
       'ModelID', 'saleElapsed', 'Hydraulics_Flow', 'fiSecondaryDesc',
       'fiModelDesc', 'ProductGroup', 'Enclosure', 'fiModelDescriptor',
       'Hydraulics', 'Drive_System', 'Tire_Size', 'Pad_Type', 'SalePrice'],
      dtype='object')
# Partition the non-target columns into continuous and categorical name lists,
# this time with max_card=9000 as the cardinality cutoff.
cont_nn,cat_nn = cont_cat_split(df_nn_final, max_card=9000, dep_var=dep_var)
cont_nn
['saleElapsed']
# Number of distinct values in each categorical column.
df_nn_final[cat_nn].nunique()
YearMade                73
ProductSize              6
Coupler_System           2
fiProductClassDesc      74
ModelID               5281
Hydraulics_Flow          3
fiSecondaryDesc        177
fiModelDesc           5059
ProductGroup             6
Enclosure                6
fiModelDescriptor      140
Hydraulics              12
Drive_System             4
Tire_Size               17
Pad_Type                 4
dtype: int64
# Exclude fiModelDescriptor from the categorical inputs.
cat_nn.remove('fiModelDescriptor')
# Same preprocessing steps as the tree pipeline, with Normalize added.
procs_nn = [Categorify, FillMissing, Normalize]
# Reuses the `splits` built for the tree models.
to_nn = TabularPandas(df_nn_final, procs_nn, cat_nn, cont_nn,
                      splits=splits, y_names=dep_var)
# 1024 is the first positional argument to dataloaders (presumably the batch
# size — confirm against the fastai API).
dls = to_nn.dataloaders(1024)
# NOTE: this rebinds the module-level `y`, which get_oob() reads as its target.
y = to_nn.train.y
y.min(),y.max()
(8.465899, 11.863583)
# y_range=(8,12) brackets the training-target range printed above (about 8.47
# to 11.86); two hidden layers of 500 and 250 units, one output, MSE loss.
learn = tabular_learner(dls, y_range=(8,12), layers=[500,250],
                        n_out=1, loss_func=F.mse_loss)
# Five epochs (0-4 in the recorded log), with 1e-2 as the learning-rate argument.
learn.fit_one_cycle(5, 1e-2)
# Predictions and targets from get_preds(), scored with the same r_mse used
# for the tree models.
preds,targs = learn.get_preds()
r_mse(preds,targs)
epoch train_loss valid_loss time
0 0.062922 0.062253 00:04
1 0.053299 0.058312 00:04
2 0.047690 0.053533 00:04
3 0.043369 0.052020 00:04
4 0.040263 0.050828 00:04
0.225451