! [ -e /content ] && pip install -Uqq fastbook kaggle waterfallcharts treeinterpreter dtreeviz
import fastbook
fastbook.setup_book()
from fastbook import *
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG

pd.options.display.max_rows = 20
pd.options.display.max_columns = 8

path = Path('/root/.fastai/archive/bluebook-for-bulldozers')
Path.BASE_PATH = path
path.ls(file_type='text')

(#7) [Path('Machine_Appendix.csv'),Path('TrainAndValid.csv'),Path('random_forest_benchmark_test.csv'),Path('Test.csv'),Path('median_benchmark.csv'),Path('ValidSolution.csv'),Path('Valid.csv')]

df = pd.read_csv(path/'TrainAndValid.csv', low_memory=False)
df.columns

Index(['SalesID', 'SalePrice', 'MachineID', 'ModelID', 'datasource',
       'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
       'saledate', 'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc',
       'fiModelSeries', 'fiModelDescriptor', 'ProductSize',
       'fiProductClassDesc', 'state', 'ProductGroup', 'ProductGroupDesc',
       'Drive_System', 'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control',
       'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
       'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics',
       'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control', 'Tire_Size',
       'Coupler', 'Coupler_System', 'Grouser_Tracks', 'Hydraulics_Flow',
       'Track_Type', 'Undercarriage_Pad_Width', 'Stick_Length', 'Thumb',
       'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type',
       'Travel_Controls', 'Differential_Type', 'Steering_Controls'],
      dtype='object')

str(df.ProductSize.unique()[1:])

"['Medium' 'Small' 'Large / Medium' 'Mini' 'Large' 'Compact']"

# Explicit ordering for the ProductSize levels, from 'Large' down to 'Compact'.
sizes = 'Large', 'Large / Medium', 'Medium', 'Small',  'Mini', 'Compact'
df['ProductSize'] = df['ProductSize'].astype('category')
# `set_categories(..., inplace=True)` is deprecated (and removed in newer pandas);
# take the returned Series and assign it back to the column instead.
df['ProductSize'] = df['ProductSize'].cat.set_categories(sizes, ordered=True)
df.ProductSize

/root/mambaforge/lib/python3.9/site-packages/pandas/core/arrays/categorical.py:2747: FutureWarning: The `inplace` parameter in pandas.Categorical.set_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  res = method(*args, **kwargs)

0            NaN
1         Medium
2            NaN
3          Small
4            NaN
           ...  
412693      Mini
412694      Mini
412695      Mini
412696      Mini
412697      Mini
Name: ProductSize, Length: 412698, dtype: category
Categories (6, object): ['Large' < 'Large / Medium' < 'Medium' < 'Small' < 'Mini' < 'Compact']

dep_var = 'SalePrice'
df[dep_var] = np.log(df[dep_var])

len(df.columns)

53

df = add_datepart(df, 'saledate')
len(df.columns)

65

' '.join(o for o in df.columns if o.startswith('sale'))

'saleYear saleMonth saleWeek saleDay saleDayofweek saleDayofyear saleIs_month_end saleIs_month_start saleIs_quarter_end saleIs_quarter_start saleIs_year_end saleIs_year_start saleElapsed'

procs = [Categorify, FillMissing]

# Training rows: saleYear before 2011, or saleMonth before October (any year).
# The remaining rows (saleYear >= 2011 and saleMonth >= 10) form the validation set.
cond = (df.saleYear<2011) | (df.saleMonth<10)
train_idx, valid_idx = np.where(cond)[0], np.where(~cond)[0]

# Positional row indices as plain Python lists: (train, valid).
splits = tuple(idx.tolist() for idx in (train_idx, valid_idx))

cont, cat = cont_cat_split(df, 1, dep_var=dep_var)

to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)
to.classes["ProductSize"]

['#na#', 'Large', 'Large / Medium', 'Medium', 'Small', 'Mini', 'Compact']

save_pickle(path/'to.pkl', to)

to = load_pickle(path/'to.pkl')

xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

m = DecisionTreeRegressor(max_leaf_nodes=4)
m.fit(xs, y)
draw_tree(m, xs, size=10, leaves_parallel=True, precision=2)

xs.loc[xs.YearMade<1900, 'YearMade'] = 1950
valid_xs.loc[valid_xs.YearMade<1900, 'YearMade'] = 1950

def r_mse(pred, y):
    """Return the root mean squared error between `pred` and `y`, rounded to 6 decimals."""
    squared_error = (pred - y) ** 2
    return round(math.sqrt(squared_error.mean()), 6)


def m_rmse(m, xs, y):
    """Return `r_mse` of the predictions `m.predict(xs)` against the targets `y`."""
    predictions = m.predict(xs)
    return r_mse(predictions, y)

# Fit on the same `xs`/`y` that received the YearMade clean-up above and that are
# scored on the next line, so training and evaluation see identical features
# (re-reading `to.train.xs` here may not reflect that clean-up).
m = DecisionTreeRegressor(min_samples_leaf=25).fit(xs, y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y), m.get_n_leaves()

(0.248595, 0.323441, 12397)

def rf(xs, y, n_estimators=40, max_samples=200_000,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit a `RandomForestRegressor` on `xs`/`y` and return the fitted model.

    `n_estimators`, `max_samples`, `max_features` and `min_samples_leaf` are
    passed straight to the estimator. Any extra keyword arguments are forwarded
    as well (previously they were accepted but silently ignored); they may also
    override the `n_jobs=-1` and `oob_score=True` defaults used here.
    """
    # Start from this notebook's defaults, then let explicit kwargs win so that
    # e.g. rf(xs, y, n_jobs=1) does not raise a duplicate-keyword error.
    extra = dict(n_jobs=-1, oob_score=True)
    extra.update(kwargs)
    m = RandomForestRegressor(n_estimators=n_estimators,
                              max_samples=max_samples, max_features=max_features,
                              min_samples_leaf=min_samples_leaf, **extra)
    return m.fit(xs, y)

m = rf(xs, y)

m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

(0.170917, 0.232626)

def rf_feat_importance(m, df):
    """Return a DataFrame of `df`'s column names (`cols`) and `m.feature_importances_` (`imp`).

    Rows are sorted by `imp`, largest first; the original positional index is kept.
    """
    importances = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importances.sort_values(by='imp', ascending=False)

fi = rf_feat_importance(m, xs)
fi[:10]

def plot_fi(fi):
    """Draw `fi` as a horizontal bar chart of `imp` per `cols` and return the plot result."""
    return fi.plot(x='cols', y='imp', kind='barh',
                   figsize=(12, 7), legend=False)
plot_fi(fi[:30])

<AxesSubplot:ylabel='cols'>

to_keep = fi[fi.imp>0.005].cols
len(to_keep)

22

xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]

m = rf(xs_imp, y)
m_rmse(m, xs_imp, y), m_rmse(m, valid_xs_imp, valid_y),

(0.180775, 0.23147)

plot_fi(rf_feat_importance(m, xs_imp))

<AxesSubplot:ylabel='cols'>

cluster_columns(xs_imp)

def get_oob(df):
    """Fit a random forest on `df` and return its out-of-bag score (`oob_score_`).

    The target is the notebook-level `y` at call time, not a parameter.
    """
    forest = RandomForestRegressor(
        n_estimators=40, min_samples_leaf=15, max_samples=50000,
        max_features=0.5, n_jobs=-1, oob_score=True,
    )
    return forest.fit(df, y).oob_score_

get_oob(xs_imp)

0.8778685460797769

# OOB score after removing each candidate column on its own.
# Further candidates, currently disabled: 'ProductGroupDesc', 'ProductGroup',
# 'fiModelDesc', 'fiBaseModel', 'Hydraulics_Flow', 'Grouser_Tracks', 'Coupler_System'.
{col: get_oob(xs_imp.drop(columns=col)) for col in ('saleYear', 'saleElapsed')}

{'saleYear': 0.8761478210481896, 'saleElapsed': 0.872101137022772}

to_drop = ['saleYear', 'ProductGroupDesc', 'fiBaseModel', 'Grouser_Tracks']
get_oob(xs_imp.drop(to_drop, axis=1))

0.874826620033826

xs_final = xs_imp.drop(to_drop, axis=1)
valid_xs_final = valid_xs_imp.drop(to_drop, axis=1)
m = rf(xs_final, y)
m_rmse(m, xs_final, y), m_rmse(m, valid_xs_final, valid_y)

(0.182696, 0.233411)

p = valid_xs_final['ProductSize'].value_counts(sort=False).plot.barh()
c = to.classes['ProductSize']
plt.yticks(range(len(c)), c);

ax = valid_xs_final['YearMade'].hist()

# `plot_partial_dependence` is deprecated in scikit-learn 1.0 and removed in 1.2;
# `PartialDependenceDisplay.from_estimator` is the supported replacement.
from sklearn.inspection import PartialDependenceDisplay

fig,ax = plt.subplots(figsize=(12, 4))
# Partial dependence of the model's prediction on YearMade and ProductSize,
# drawn onto the axes created above.
PartialDependenceDisplay.from_estimator(m, valid_xs_final, ['YearMade','ProductSize'],
                                        grid_resolution=20, ax=ax);

/root/mambaforge/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_partial_dependence is deprecated; Function `plot_partial_dependence` is deprecated in 1.0 and will be removed in 1.2. Use PartialDependenceDisplay.from_estimator instead
  warnings.warn(msg, category=FutureWarning)

!pip install -Uqq --user treeinterpreter

!pip install -Uqq --user waterfallcharts

import warnings
warnings.simplefilter('ignore', FutureWarning)

from treeinterpreter import treeinterpreter
from waterfall_chart import plot as waterfall

row = valid_xs_final.iloc[:5]
#row.values.shape

prediction, bias, contributions = treeinterpreter.predict(m, row.values)

prediction[0], bias[0], contributions[0].sum()

(array([9.95946805]), 10.104300126903656, -0.14483207244735555)

waterfall(valid_xs_final.columns, contributions[0], threshold=0.08, 
          rotation_value=45,formatting='{:,.3f}');

time_vars = ['SalesID','MachineID']
xs_final_time = xs_final.drop(time_vars, axis=1)
valid_xs_time = valid_xs_final.drop(time_vars, axis=1)

# Reload the raw data and repeat the preprocessing steps for the neural-net pipeline.
df_nn = pd.read_csv(path/'TrainAndValid.csv', low_memory=False)
df_nn['ProductSize'] = df_nn['ProductSize'].astype('category')
# `set_categories(..., inplace=True)` is deprecated (and removed in newer pandas);
# assign the returned Series back to the column instead.
df_nn['ProductSize'] = df_nn['ProductSize'].cat.set_categories(sizes, ordered=True)
df_nn[dep_var] = np.log(df_nn[dep_var])
df_nn = add_datepart(df_nn, 'saledate')

df_nn_final = df_nn[list(xs_final_time.columns) + [dep_var]]

df_nn_final.columns

Index(['YearMade', 'ProductSize', 'Coupler_System', 'fiProductClassDesc',
       'ModelID', 'saleElapsed', 'Hydraulics_Flow', 'fiSecondaryDesc',
       'fiModelDesc', 'ProductGroup', 'Enclosure', 'fiModelDescriptor',
       'Hydraulics', 'Drive_System', 'Tire_Size', 'Pad_Type', 'SalePrice'],
      dtype='object')

cont_nn,cat_nn = cont_cat_split(df_nn_final, max_card=9000, dep_var=dep_var)

cont_nn

['saleElapsed']

df_nn_final[cat_nn].nunique()

YearMade                73
ProductSize              6
Coupler_System           2
fiProductClassDesc      74
ModelID               5281
Hydraulics_Flow          3
fiSecondaryDesc        177
fiModelDesc           5059
ProductGroup             6
Enclosure                6
fiModelDescriptor      140
Hydraulics              12
Drive_System             4
Tire_Size               17
Pad_Type                 4
dtype: int64

cat_nn.remove('fiModelDescriptor')

# Build the neural-net TabularPandas: the same procs as the tree models plus Normalize,
# using the same `splits` and dependent variable.
procs_nn = [Categorify, FillMissing, Normalize]
to_nn = TabularPandas(df_nn_final, procs_nn, cat_nn, cont_nn,
                      splits=splits, y_names=dep_var)
# presumably 1024 is the batch size — confirm against the fastai `dataloaders` signature
dls = to_nn.dataloaders(1024)
# NOTE(review): this rebinds the notebook-level `y`, which `get_oob` reads as its
# target; calls to `get_oob` after this cell will use `to_nn.train.y`.
y = to_nn.train.y
# Inspect the range of the training target.
y.min(),y.max()

(8.465899, 11.863583)

learn = tabular_learner(dls, y_range=(8,12), layers=[500,250],
                        n_out=1, loss_func=F.mse_loss)
learn.fit_one_cycle(5, 1e-2)
preds,targs = learn.get_preds()
r_mse(preds,targs)

0.225451

epoch	train_loss	valid_loss	time
0	0.062922	0.062253	00:04
1	0.053299	0.058312	00:04
2	0.047690	0.053533	00:04
3	0.043369	0.052020	00:04
4	0.040263	0.050828	00:04

	cols	imp
57	YearMade	0.173388
6	ProductSize	0.110295
30	Coupler_System	0.100494
7	fiProductClassDesc	0.069326
54	ModelID	0.055765
65	saleElapsed	0.050687
32	Hydraulics_Flow	0.048636
3	fiSecondaryDesc	0.047010
31	Grouser_Tracks	0.044165
1	fiModelDesc	0.031284