### Práctica Regularización

#### Objetivo: predecir la tasa de violencia criminal para una comunidad basada en datos socioeconómicos y legales

##### Cargamos librerías y el set de datos

In [1]:
# Importamos las librerías requeridas
import numpy as np 
import pandas as pd
import matplotlib.pyplot 
%matplotlib inline
import seaborn as sns 
import warnings 
warnings.filterwarnings('ignore')

# Cargamos el dataset y mostramos las 5 primeras filas
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data'
df = pd.read_csv(url, header=None, na_values=['?'])
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,8,,,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,,,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.0,,0.67
2,24,,,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.0,,0.43
3,34,5.0,81440.0,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,,,,,0.0,,0.12
4,42,95.0,6096.0,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.0,,0.03


##### Corregimos los posibles errores del datasets y limpiamos los valores NaN

In [2]:
# Comprobamos si hay valores que falten
sum(df.isnull())

8128

In [3]:
# Nos quedamos con los valores que vamos a usar para la regresión (las columnas 0 - 4 sobran)
df.drop([0,1,2,3,4],axis=1,inplace=True)
# Borramos las filas donde hay valores NaN o faltantes
df.dropna(inplace=True)


##### Dividimos los datos en un set de entrenamiento y otro de test

In [4]:
# Definimos x e y
X = df.drop(127, axis=1) # x (features) son todas las columnas menos la última
y = df[127] # la variable target es la última columna del dataset

# Dividimos los datos (80% entrenamiento y 20% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state=1)



In [5]:
##### Probamos a entrenar un modelo de regresión lineal

In [6]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
print ("intersección : ",linreg.intercept_)
print ("coeficientes : ",linreg.coef_)

intersección :  1.2222288458382677
coeficientes :  [-4.42589321e+00  7.43581698e-01 -2.37354783e-01 -3.42019325e-01
 -1.92952888e-01  1.97882196e-01 -5.69565279e-01 -2.39234862e-01
  2.56346323e-01 -2.53019568e-01  4.97805236e+00 -2.29564054e-01
  1.04231718e+00 -2.00093384e-01 -2.72480752e-01 -1.07373823e+00
  2.77147368e-01  6.96250848e-01  4.66638644e-02  9.41723218e-01
 -3.83963043e-01 -4.85962474e-01 -4.58632583e-01  1.11270348e-01
 -1.56014481e-01  3.68813946e-01 -1.00864287e-01 -6.72656722e-01
  1.10883889e+00  1.68323233e-01 -1.21950733e+00 -2.53471804e-01
  7.33442742e-02  7.15942305e-01 -1.81081931e-01  9.20512294e-02
  4.17571224e-01  7.06569303e-01  1.03353131e+00 -1.47913599e-01
  8.47063463e-01 -2.34573029e+00  1.37140832e-02 -9.78225533e-01
 -1.08738443e+00  9.17727824e-01  1.77832424e-01 -6.84632936e-01
  5.71217036e-01 -2.30589336e-02  2.24301713e-02 -1.17062479e-01
  5.60860209e-01 -1.07789537e+00  1.29415850e-01  6.06418661e-01
 -2.07511082e-01  2.04985247e-01  2.047

##### Probamos a entrenar un modelo regularizado Ridge

In [7]:
# alpha=0 is equivalent to linear regression
from sklearn.linear_model import Ridge
# Probamos con alpha(equivalente a lambda) alpha=0.1
ridgereg = Ridge(alpha=0.1, normalize=True)
ridgereg.fit(X_train, y_train)
y_pred = ridgereg.predict(X_test)
print ("intersección : ",ridgereg.intercept_)
print ("coeficientes : ",ridgereg.coef_)

intersección :  0.6031478392948995
coeficientes :  [-4.19420712e-03  4.87518242e-02  4.93277241e-02 -6.70439837e-02
 -2.56807909e-02  3.00891598e-02 -2.26437683e-03  4.88607036e-03
 -1.17402526e-02  5.39130920e-02 -1.96111356e-03 -5.04991706e-02
  6.16101766e-02 -1.25202709e-01 -1.11263671e-01 -3.21017929e-01
 -5.26803286e-03  1.00386441e-01 -8.51226741e-02  8.87088856e-02
 -3.24830032e-02 -8.29007851e-03 -1.63041710e-01  4.96803864e-02
 -8.63347638e-03  1.98768114e-01  4.55137112e-02 -1.02017480e-01
  3.25293355e-02 -1.08495117e-01 -1.15823610e-01 -8.16234720e-03
  1.06339638e-01 -1.60554977e-02 -1.30128561e-01  8.00976191e-02
  8.08618327e-02  1.66357401e-02  6.62985387e-02  7.09198590e-02
 -4.22803329e-02  9.34179907e-03  8.50660007e-02 -1.41521793e-01
 -1.47018239e-01 -5.95817501e-02 -8.07355427e-02 -2.90576174e-02
  7.03720246e-02 -5.68810174e-02  1.42026318e-01 -5.85527884e-02
 -4.12406651e-03 -1.14260674e-01  5.92227293e-02  8.67904515e-02
 -4.80438851e-03  1.58145107e-03  1.934

##### Vemos el efecto en los coeficientes de la regularización Ridge. Todos se han aproximado a valores entorno al cero.

##### Probamos a entrenar un modelo regularizado Lasso

In [8]:
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=0.005, normalize=True)
lassoreg.fit(X_train, y_train)
print(lassoreg.coef_)

[ 0.          0.          0.         -0.17297361  0.          0.
  0.          0.          0.         -0.          0.          0.
 -0.         -0.         -0.         -0.         -0.          0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.          0.          0.          0.
  0.         -0.          0.         -0.         -0.          0.
  0.         -0.          0.          0.          0.          0.
  0.         -0.         -0.50465657 -0.         -0.         -0.
 -0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
 -0.          0.          0.          0.          0.          0.
  0.         -0.          0.          0.         -0.          0.
 -0.         -0.          0.          0.         -0.          0.
  0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.          0.         -0.          0.
  0.          0.         

##### Con una regularización Lasso, muchos de los coeficientes son directamente cero.

##### Ahora juega tú a cambiar los valores de alpha y descubre cómo varían los coeficientes. Te animo también a que pruebes Ridge y Lasso con validación cruzada RidgeCV y LassoCV.