# fastbook chapter 09 (decision trees, random forests & deep learning) with the California Housing Prices dataset
from fastai.imports import *
from fastai.tabular.all import *
from kaggle import api
# Kaggle dataset identifier; the part after '/' is reused below as the local directory name.
data = 'camnugent/california-housing-prices'
# Query Kaggle for the identifier (presumably prints the matching datasets - confirm against the kaggle package).
api.dataset_list_cli(search=data)
# Local target directory 'california-housing-prices', relative to the current working directory.
path = Path(data.split('/')[1])
# Fetch the dataset's files into `path`; the extraction step that follows expects a zip archive there.
api.dataset_download_files(data, path)
# Bare expression: presumably kept from the notebook, where it displays the directory listing.
path.ls()
import zipfile

# Select the archive by extension instead of by its position in the directory
# listing: once the CSV has been extracted next to it, the first entry of the
# listing is no longer guaranteed to be the zip file.
archives = sorted(path.glob('*.zip'))
if not archives:
    raise FileNotFoundError(f'no .zip archive found in {path}')
# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile(archives[0]) as archive:
    archive.extractall(path)
# Bare expression: presumably kept from the notebook, where it displays the directory listing.
path.ls()
# Read the table that was extracted into `path`.
df = pd.read_csv(path/'housing.csv', low_memory=False)
# Bare expressions below only show output when run as notebook cells.
df.head()
df.columns
# Per-column histograms of the data as loaded.
df.hist(figsize=(12,12))

# Replace the four listed columns with their natural logarithm and plot the
# histograms again (presumably to reduce skew - compare the two plots).
x = ['total_rooms', 'total_bedrooms', 'population', 'households']
df[x] = np.log(df[x])
df.hist(figsize=(12,12))
# Column roles: one categorical feature, eight continuous ones.
cat = ['ocean_proximity']
cont = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
]
# Preprocessing steps handed to TabularPandas.
procs = [Categorify, FillMissing, Normalize]

# Regression on 'median_house_value' with a random train/validation split.
# NOTE: `to` ends up bound to the result of `.dataloaders(...)`, not to the
# TabularPandas object itself; the same name is passed to tabular_learner later.
to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names='median_house_value',
    y_block=RegressionBlock(),
    splits=RandomSplitter()(df),
    reduce_memory=False,
).dataloaders(path='.')

# Processed features and targets for the scikit-learn models below.
xs = to.train.xs
y = to.train.y
val_xs = to.valid.xs
val_y = to.valid.y
from sklearn.ensemble import RandomForestRegressor
# Random forest: 100 trees, each leaf holding at least 5 training samples.
m = RandomForestRegressor(n_estimators=100, min_samples_leaf=5)
m.fit(xs, y)
# Mean absolute error on the validation split.
print('MAE:', abs(val_y - m.predict(val_xs)).mean())

# Feature importances of the fitted forest, largest first.
x = pd.DataFrame({'cols':xs.columns, 'imp':m.feature_importances_})
x = x.sort_values(by='imp', ascending=False)
x.set_index('cols').plot(kind='barh')
# Bare expression: only shows the table when run as a notebook cell.
x
from sklearn.tree import DecisionTreeRegressor, export_graphviz
# Single decision tree capped at 30 leaves; this rebinds `m`, replacing the forest.
m = DecisionTreeRegressor(max_leaf_nodes=30)
m.fit(xs, y)
preds = m.predict(val_xs)
# Mean absolute error on the validation split (bare expression: only visible as notebook output).
abs(val_y - preds).mean()
import graphviz
def draw_tree(t, df, size=10, ratio=0.6, precision=2, **kwargs):
    """Render a fitted scikit-learn decision tree as a Graphviz source object.

    Args:
        t: Fitted tree estimator, passed to ``export_graphviz``.
        df: DataFrame whose column labels are used as the feature names.
        size: Value written into the graph's ``size`` attribute.
        ratio: Value written into the graph's ``ratio`` attribute.
        precision: Forwarded to ``export_graphviz`` as ``precision``.
        **kwargs: Extra keyword arguments forwarded to ``export_graphviz``.

    Returns:
        A ``graphviz.Source`` built from the exported DOT text with the
        ``size`` and ``ratio`` attributes injected after the graph header.
    """
    # A plain list of labels is accepted by every scikit-learn version,
    # whereas a pandas Index is not.
    dot = export_graphviz(t, out_file=None, feature_names=list(df.columns), filled=True,
                          rounded=True, special_characters=True, rotate=False,
                          precision=precision, **kwargs)
    # The header substitution is a literal one, so plain str.replace is used:
    # no regex metacharacter ('{') to worry about and no dependency on `re`.
    return graphviz.Source(dot.replace('Tree {', f'Tree {{ size={size}; ratio={ratio}'))
# Visualise the decision tree currently bound to `m`, labelling features with the columns of `xs`.
draw_tree(m, xs, size=10)
# Neural-network model on the same data: `layers=[10, 10]` and an L1 loss
# reported as the metric.
learn = tabular_learner(to, layers=[10,10], metrics=L1LossFlat())
# Learning-rate range test, asking for the `slide` and `valley` suggestions.
learn.lr_find(suggest_funcs=(slide, valley))
# Train for 10 epochs at a fixed learning rate of 0.1, then plot the recorded losses.
learn.fit(n_epoch=10, lr=0.1)
learn.recorder.plot_loss()