Machine Learning for RPA: Numeric Data

Connectionist machine learning methods can directly deal with numeric input data, most commonly performing one of these functions:

  • predict classes for observations
  • estimate other numeric output values

In our sample application we will learn to predict the default of credit card payments, i.e. classify customers into default vs. non-default.

UCI ML Credit Card Defaults Dataset

The UCI site provides a dataset on credit card payments and defaults in Taiwan from 2005. We can use these data to construct reasonably realistic synthetic training data for our robot application.

The following notebook code uses the shell escape ! to run a command line statement, in this case wget for downloading the dataset. Note the option -nc which avoids repeated downloads when the file is already present.

In [107]:
!wget -nc https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls
File ‘default of credit card clients.xls’ already there; not retrieving.

The Pandas package can read various formats of data files, including JSON, CSV, and of course XLS. We do not need any proprietary software to read the data file.

The first line contains additional headers which we skip with the skiprows option.

There is a small inconsistency in naming the PAY columns which we correct using the rename() function.

Other than that the data looks good, which we can check here using the head() function:

  • all column headers are valid variable names, except for the last, which we have renamed
  • all data are numeric, at first glance they are all integer
  • no immediately obvious missing values or other problems
  • the values cover a wide range in magnitude; we will need to address that
In [108]:
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'all':lambda x: '%6.2f' % x})

df = pd.read_excel('default of credit card clients.xls', skiprows=(1))
df = df.rename(columns={'default payment next month':'DEF', 'PAY_0':'PAY_1'})
df.head()
Out[108]:
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 DEF
0 1 20000 2 2 1 24 2 2 -1 -1 ... 0 0 0 0 689 0 0 0 0 1
1 2 120000 2 2 2 26 -1 2 0 0 ... 3272 3455 3261 0 1000 1000 1000 0 2000 1
2 3 90000 2 2 2 34 0 0 0 0 ... 14331 14948 15549 1518 1500 1000 1000 1000 5000 0
3 4 50000 2 2 1 37 0 0 0 0 ... 28314 28959 29547 2000 2019 1200 1100 1069 1000 0
4 5 50000 1 2 1 57 -1 0 -1 0 ... 20940 19146 19131 2000 36681 10000 9000 689 679 0

5 rows × 25 columns

If the Pandas package is missing from your system then you can install it on the command line:

python3 -m pip install pandas --user

Imbalance in the Number of Observations per Class

Many training datasets for classification suffer from a severe imbalance in the number of observations per class. This must be addressed in some manner, otherwise the net can simply learn to predict the most frequent class and still achieve a high accuracy.

Among the various methods to deal with this problem we choose a simple approach:

  • choose a number of the most frequent classes to keep (here: both)
  • determine the minimum number of observations among these classes
  • draw a sample of this size from each class

Pandas dataframes have some nifty group-by and sample functions that allow us to draw samples from the groups formed by the column values. In this fashion we get equal numbers of observations for all classes, which makes the learning performance much easier to interpret.

The value count reveals a strong imbalance in the number of observations per default class:

In [109]:
df.value_counts('DEF')
Out[109]:
DEF
0    23364
1     6636
dtype: int64
In [110]:
minobs = min(df.value_counts('DEF').values)
print(minobs)
6636

We re-sample the data to make our interpretations of the machine learning result simpler:

In [111]:
df = df.groupby('DEF').sample(n=minobs, random_state=1).sample(frac=1, random_state=1)
df.iloc[:5,:]
Out[111]:
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 DEF
2231 2232 230000 2 1 2 49 2 2 -1 -1 ... 0 2119 0 0 1100 0 2119 0 2916 1
6574 6575 200000 1 1 1 38 -2 -2 -2 -2 ... 4115 1315 0 2658 0 4125 1315 0 0 0
13810 13811 40000 1 1 1 47 2 2 2 2 ... 12595 14386 14005 2000 1000 0 2000 0 2000 0
7934 7935 180000 2 2 2 28 1 2 -1 -1 ... 0 22357 21541 0 1600 0 22357 1500 785 1
7277 7278 150000 2 1 1 44 1 2 2 -1 ... 10195 24232 22000 0 0 10195 24232 0 0 1

5 rows × 25 columns

Now we have the same number of defaulting and non-defaulting clients in the dataset:

In [112]:
print(df['DEF'].value_counts())
1    6636
0    6636
Name: DEF, dtype: int64

Since these data are all numeric we can convert them to NumPy arrays which tend to be computationally more efficient than Pandas dataframes, and also somewhat easier to use (when not doing statistics).

We omit the first column (ID) and the last (payment default). To avoid stating all the column names we use the .iloc method of indexing.

In [113]:
X = np.asarray(df.iloc[:,1:-1])

Scaling

Numerical data is much easier to handle in machine learning applications when it is scaled into a common order of magnitude; here we divide all values in each column by the column maximum. Another common choice is centering and scaling, i.e. subtract the mean and divide by the standard deviation.

We will need the maximum values later when we scale additional input in our application, so this parameter will go into the saved model.

In [114]:
Xmax = X.max(0)
In [115]:
X = X / Xmax
#X = (X - X.mean(0)) / X.std(0)

print(X)
[[  0.29   1.00   0.17 ...   0.00   0.00   0.01]
 [  0.25   0.50   0.17 ...   0.00   0.00   0.00]
 [  0.05   0.50   0.17 ...   0.00   0.00   0.00]
 ...
 [  0.54   1.00   0.17 ...   0.00   0.01   0.00]
 [  0.38   0.50   0.17 ...   0.02   0.03   0.02]
 [  0.04   0.50   0.33 ...   0.00   0.01   0.00]]
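
As an aside, scikit-learn also provides ready-made scalers that store their fitted parameters, which could later be pickled along with the model much like Xmax. This is only a sketch; note that MinMaxScaler uses the column minimum and maximum rather than the maximum alone, so the values differ slightly from ours.

In [ ]:
# Sketch of the same idea with scikit-learn scalers; MinMaxScaler maps each
# column to [0, 1] using (x - min) / (max - min), a slightly different
# convention than dividing by the column maximum alone.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = MinMaxScaler()      # or StandardScaler() for centering and scaling
X_alt = scaler.fit_transform(np.asarray(df.iloc[:, 1:-1]))
print(X_alt[:2])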

Since DEF is a single number in {0,1} for each observation we can let our neural net directly learn this value.

We convert it to a small-integer NumPy array just to have both X and Y in the same internal format.

In [116]:
Y = np.asarray(df['DEF'], dtype='int8')
print(Y)
[  1.00   0.00   0.00 ...   0.00   1.00   0.00]

Feed-Forward Neural Net

The basic neural net structure contains

  • an input layer of $x_i$ values describing an observation, such as a text encoded as bag of words
  • a hidden layer of $h_i$ activations computed from the input layer using weights $U$
  • an output layer of $o_i$ values computed from the hidden layer with weights $V$

The output $o$ is compared to the values $y$ in the training set. The difference $o-y$ is used in the learning algorithm to update the weights.

In the simple feed-forward model with one hidden layer $h$ we compute $h$ from input $x$ and output $o$ from $h$:

\begin{align} h & = f(U x ) \\ o & = g(V h) \end{align}

with $f()$ and $g()$ some non-linear activation functions, such as tanh, sigmoid, or relu.

Input $x$ is a vector with $n$ elements, a numeric representation of an observation.

With an additional input $x_{n+1} = 1$ we can omit bias $b$ and shorten the notation.
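
As an illustration of this bias trick (a sketch only, not used further below), we simply append a constant column of ones to the input, so that the last column of $U$ takes the role of $b$:

In [ ]:
# Sketch of the bias trick: append a constant 1 to every observation so that
# the last column of U acts as the bias vector b (not used further below).
X1 = np.hstack([X, np.ones((X.shape[0], 1))])
print(X.shape, '->', X1.shape)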

The weight matrices $U$ and $V$ are the trainable parameters. The optimization (learning) aims at approximating $o$ to the training data $y$ i.e. minimizing the errors $y-o$. A cost function is defined based on the errors. A gradient descent algorithm is commonly used for optimization.

Not a Magic Black Box

Let's see how this works on a very small subset of the data.

We will only use the payment status and take the x and y values from the first n observations:

In [117]:
n = 10
x = np.asarray(df[['PAY_6','PAY_5','PAY_4','PAY_3','PAY_2','PAY_1']][:n])
print(x)
[[ -1.00  -1.00  -1.00  -1.00   2.00   2.00]
 [ -2.00  -2.00  -2.00  -2.00  -2.00  -2.00]
 [  2.00   2.00   2.00   2.00   2.00   2.00]
 [  0.00  -1.00  -1.00  -1.00   2.00   1.00]
 [  2.00  -1.00  -1.00   2.00   2.00   1.00]
 [  0.00   0.00   0.00   0.00   0.00   0.00]
 [ -2.00  -2.00  -2.00  -2.00  -2.00  -2.00]
 [  0.00   0.00   0.00   0.00   0.00   0.00]
 [  0.00   0.00   0.00   0.00   0.00   0.00]
 [ -1.00  -1.00  -2.00  -2.00  -2.00   1.00]]
In [118]:
y = np.asarray(df[['DEF']][:n])[:,0]
print(y)
[  1.00   0.00   0.00   1.00   1.00   0.00   0.00   1.00   0.00   0.00]

Now we set the number of hidden states and compute some random weights U and V:

In [119]:
hid = 5
U = np.random.random((hid, x.shape[1])) * 4 - 2
print(U)
[[ -1.12   1.25  -0.57   1.90   1.26  -0.93]
 [ -0.43  -0.96   1.36   0.81   0.78  -0.69]
 [ -0.82  -2.00   0.60   0.76   0.71  -0.37]
 [  0.78   0.43  -1.16   1.44  -1.17  -1.13]
 [  1.97   0.74  -1.03  -0.33  -0.95   0.18]]
In [120]:
V = np.random.random((1,hid)) * 4 - 2
print(V)
[[  1.47  -1.51   0.48   0.33  -1.24]]

We want to compute f(U x) so we need matrix multiplication and a non-linear function f.

Matrix multiplication can be done with np.dot(), e.g. to get the matrix-vector product for the first observation:

In [121]:
print(np.dot(U, x[0]))
[ -0.80  -0.60   2.13  -6.08  -2.89]

However, NumPy easily lets us process all observations at once if we transpose x.

Note the first column which is identical to the result above:

In [122]:
print(np.dot(U, x.T))
[[ -0.80  -3.57   3.57  -0.99   2.48   0.00  -3.57   0.00   0.00  -6.23]
 [ -0.60  -1.73   1.73  -0.34   1.24   0.00  -1.73   0.00   0.00  -5.20]
 [  2.13   2.26  -2.26   1.68   2.31   0.00   2.26   0.00   0.00  -1.68]
 [ -6.08   1.61  -1.61  -4.18   1.70   0.00   1.61   0.00   0.00  -0.56]
 [ -2.89  -1.15   1.15  -1.10   1.85   0.00  -1.15   0.00   0.00   2.10]]

For our non-linear function we use the sigmoid as plotted below.

☆ This is the classic choice for artificial neural nets; its simple derivative f'(x) = f(x)(1 - f(x)) makes it very attractive for implementation. Sadly it suffers from the vanishing gradient problem: we can see from the plot that as the absolute values of the input go beyond several units the curve becomes very flat, meaning that the gradient goes towards zero. This is a problem for gradient-based optimization. For this reason various other non-linear functions are now more commonly in use, such as the ReLU (rectified linear unit).

In [123]:
import matplotlib.pyplot as plt
a = np.linspace(-5, 5, 1000)
b = 1 / (1 + np.exp(-a) )
plt.figure(figsize=(8, 3))
plt.plot(a, b)
Out[123]:
[<matplotlib.lines.Line2D at 0x7f93c0899e48>]
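
For comparison, the ReLU mentioned above can be plotted in the same way (a sketch; it simply clips negative values to zero, so its gradient is 1 for all positive inputs and does not flatten out):

In [ ]:
# Sketch: the ReLU activation max(0, x), plotted over the same range as the
# sigmoid above.
relu = np.maximum(0, a)
plt.figure(figsize=(8, 3))
plt.plot(a, relu)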

We apply the non-linear function to the result of the matrix multiplication to get the states h of the hidden layer.

We are still computing all observations at once rather than going through a loop, so the column vectors are the states h for each observation in x:

In [124]:
h = 1 / (1 + np.exp(-1 * np.dot(U, x.T)))
print(h)
[[  0.31   0.03   0.97   0.27   0.92   0.50   0.03   0.50   0.50   0.00]
 [  0.35   0.15   0.85   0.42   0.78   0.50   0.15   0.50   0.50   0.01]
 [  0.89   0.91   0.09   0.84   0.91   0.50   0.91   0.50   0.50   0.16]
 [  0.00   0.83   0.17   0.02   0.85   0.50   0.83   0.50   0.50   0.36]
 [  0.05   0.24   0.76   0.25   0.86   0.50   0.24   0.50   0.50   0.89]]

Again we can compute the output for all hidden states in a single statement.

☆ Note that we could have more than one output state so the result is a matrix; however, here there is only one output state, so we get a matrix with one row (and not a vector).

In [125]:
o = 1/(1 + np.exp(-1 * np.dot(V, h)))
print(o)
[[  0.57   0.56   0.33   0.47   0.46   0.44   0.56   0.44   0.44   0.29]]

Clearly random weights do not achieve anything close to the actual values y (unless we are very, very lucky).

In [126]:
print(o-y)
[[ -0.43   0.56   0.33  -0.53  -0.54   0.44   0.56  -0.56   0.44   0.29]]

We sum the squared errors, divide by the number of observations, and call this our cost function:

In [127]:
c = np.sum((o-y)**2)/n
print(c)
0.22738725407473867

By applying some optimization method we can adjust the weights to achieve lower (and maybe even minimal) cost.

  • In neural networks this is commonly done using gradient descent
  • the derivative of the cost function with respect to the weights is computed
  • the weights are adjusted in the direction of lower cost
  • this is done repeatedly, usually for a set number of epochs i.e. runs through the whole data (see the sketch below)

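To make this concrete, here is a minimal sketch of gradient descent (backpropagation) for the tiny one-hidden-layer net above; the learning rate lr and the number of epochs are arbitrary choices, and sig() is just the sigmoid used before.

In [ ]:
# A minimal sketch of gradient descent for the one-hidden-layer net above;
# lr (learning rate) and the number of epochs are arbitrary choices.
def sig(z):
    return 1 / (1 + np.exp(-z))

lr = 1.0
U = np.random.random((hid, x.shape[1])) * 4 - 2
V = np.random.random((1, hid)) * 4 - 2

for epoch in range(5000):
    h = sig(np.dot(U, x.T))                  # hidden layer, shape (hid, n)
    o = sig(np.dot(V, h))                    # output layer, shape (1, n)
    d_o = 2 * (o - y) / n * o * (1 - o)      # cost gradient through output sigmoid
    d_h = np.dot(V.T, d_o) * h * (1 - h)     # propagated back to the hidden layer
    V -= lr * np.dot(d_o, h.T)               # update output weights
    U -= lr * np.dot(d_h, x)                 # update hidden weights

print('cost:', np.sum((o - y)**2) / n)
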
Or, we can simply repeat the random process above until we arrive at a satisfactory result: not a practical approach, but interesting to see.

Given a specific input, number of hidden units, and range of weights, the cost cannot go below a certain threshold.

In [128]:
c = 1
e = 0
while c > 0.1:
    U = np.random.random((hid, x.shape[1])) * 4 - 2
    V = np.random.random((1,hid)) * 4 - 2
    h = 1/(1 + np.exp(-1 * np.dot(U, x.T)))
    o = 1/(1 + np.exp(-1 * np.dot(V, h)))
    c = np.sum((o-y)**2)/n
    e += 1
    
print('y:     ', np.asarray([y])*1.0)
print('o:     ', o)
print('sq.err:', (o-y)**2)
print('cost:  ', c)
print('epochs:', e)
y:      [[  1.00   0.00   0.00   1.00   1.00   0.00   0.00   1.00   0.00   0.00]]
o:      [[  0.96   0.18   0.43   0.94   0.89   0.29   0.18   0.29   0.29   0.21]]
sq.err: [[  0.00   0.03   0.19   0.00   0.01   0.08   0.03   0.51   0.08   0.04]]
cost:   0.09835989425654287
epochs: 24657

Usually in such a situation output values o below 0.5 are taken as 0 and those above 0.5 as 1, which gives us a more practically useful result (payment default or not).

However, depending on the application, the output values can also be interpreted as probabilities.

In [129]:
d = np.asarray([int(v>0.5) for v in o[0]])
print('predict: ', d)
print('actual:  ', y)
print('accuracy:', np.sum(y==d)/n)
predict:  [  1.00   0.00   0.00   1.00   1.00   0.00   0.00   0.00   0.00   0.00]
actual:   [  1.00   0.00   0.00   1.00   1.00   0.00   0.00   1.00   0.00   0.00]
accuracy: 0.9

Deep Learning

Connectionist machine learning experienced a revival in the late 2000s with the advent of deep learning: such models are commonly characterized by

  • several hidden layers instead of just one
  • stochastic gradient descent: only a small part of the training data is used in minibatches to update the weights; running through all the data is called one epoch.
  • more elaborate architectural elements are employed, such as dropout, max-pooling, and even recurrent designs

☆ If you are interested in Deep Learning then take a look at Tensorflow; using the Keras package which is now part of Tensorflow makes it easy to construct elaborate deep learning networks. However, since Tensorflow is somewhat cutting edge it is often tricky to get it to work on your computer, especially when you wish to make use of your graphics card in GPU computing; version problems continue to haunt that software, which is why we stick with the much more stable sklearn package: it is just one pip install away and works flawlessly out of the box on any system. Sadly, this is not true for Tensorflow.
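
Purely as an illustration, a minimal Keras sketch of a network comparable to the MLP we build with sklearn below might look like this (assuming a working Tensorflow installation; the layer sizes here are arbitrary):

In [ ]:
# A sketch only, assuming Tensorflow/Keras is installed; mirrors the kind of
# feed-forward net built with sklearn below, with two hidden layers.
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Dense(100, activation='relu', input_shape=(X.shape[1],)),
    keras.layers.Dense(20, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),    # probability of default
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit() would then train in minibatches (stochastic gradient descent),
# e.g. model.fit(X_train, y_train, epochs=50, batch_size=32)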

MLPClassifier

We use a simple feed-forward net as provided by the scikit-learn package, the Multi-Layer-Perceptron Classifier.

We call MLPClassifier() with the max_iter parameter to save computation time here; see the full documentation at https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html for additional parameters.

By default MLPClassifier() uses only one hidden layer with 100 units; however, we can easily change to a deep learning structure, e.g. by adding hidden_layer_sizes=(200,50,) to the parameter list.

We split the data into training and testing set. Common splits are about 90:10 to 70:30.

We also print the number of observations per label value to check for even distribution after splitting.

  • The net adapts its weights on the training set
  • then the weights remain fixed and are applied to the testing set

The scores show the performance on the training and testing data.

On the training data we can often find parameters that achieve accuracy close to 1.0; this is known as overfitting. However, only the performance on the test data, which was not seen during training, is relevant for practical application.

In [130]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

print(X.shape, Y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print('training size:', X_train.shape[0], 
      'testing size:', X_test.shape[0],
      'label counts:', np.unique(y_train, return_counts=True)[1])

clf = MLPClassifier(hidden_layer_sizes=(100,20,),
                    max_iter=50).fit(X_train, y_train)

print('score train:', clf.score(X_train, y_train))
print('score test: ', clf.score(X_test, y_test))
(13272, 23) (13272,)
training size: 10617 testing size: 2655 label counts: [5347.00 5270.00]
score train: 0.7246868230196855
score test:  0.7054613935969868
/home/hugo/.local/lib/python3.6/site-packages/sklearn/neural_network/_multilayer_perceptron.py:617: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (50) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)

The accuracy on the test set is the benchmark for how well the net has 'learned' the concepts in the training data i.e. how well it can 'generalize' to new data.

The accuracy on the test set is usually worse than on the training set.

In a more elaborate setting we would further split into training, validation, and test set, since adjusting the parameters is itself an optimization process.

  • training set: the weights are adapted
  • validation set: the performance is evaluated, the parameters are adapted, and training starts again
  • test set: in the final step the performance is recorded without further parameter change

In other words, we are optimizing the weights on the training set and the parameters on the validation set. The test set is used for the final performance on real-world data i.e. data the net has not seen in training, and we have not seen while adjusting our parameters.
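
For example, such a three-way split can be done with two calls to train_test_split (a sketch; the 70:15:15 proportions are an arbitrary choice):

In [ ]:
# Sketch of a train/validation/test split (70:15:15, arbitrary proportions):
# the validation set is used for tuning parameters, the test set only once.
X_tr, X_tmp, y_tr, y_tmp = train_test_split(X, Y, test_size=0.3, random_state=1)
X_val, X_te, y_val, y_te = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=1)
print(X_tr.shape[0], X_val.shape[0], X_te.shape[0])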

After training the MLPClassifier gives us the choice of output:

  • clf.predict_proba() -- the probability for each class
  • clf.predict() -- the (most likely) numeric label
In [131]:
print('pred prob: ', clf.predict_proba(X[:1]))
print('pred class:', clf.predict(X[:1]))
pred prob:  [[  0.31   0.69]]
pred class: [  1.00]

Persistent ML Model

We save our model in a pickle file so that we can use it again later to process new input. We will also need the Xmax values for scaling new input.

In [132]:
import pickle

pickle.dump((clf, Xmax), open('ccd.pkl', 'wb'))

Check that the model produces identical results when loaded from the pickle file:

In [133]:
clf2, Xmax2 = pickle.load(open('ccd.pkl', 'rb'))

print(clf2.predict_proba(df.iloc[:1,1:-1] /Xmax2 ))
[[  0.31   0.69]]

Misc

Number of cases per class -- observed (Y) and predicted (p):

In [135]:
p = clf.predict(X)
In [136]:
p.shape
Out[136]:
(13272,)
In [137]:
sum(p==1), sum(Y==1), sum(p==0), sum(Y==0)
Out[137]:
(5605, 6636, 7667, 6636)
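
The same comparison can be summarized in a confusion matrix (a sketch using sklearn.metrics; rows are the actual classes, columns the predicted ones):

In [ ]:
# Sketch: confusion matrix of actual (Y) vs predicted (p) classes,
# rows = actual, columns = predicted.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y, p))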