In [96]:
import pandas as pd
import numpy as np
import csv
import gzip
import string
from random import random, sample
rs = 2 # random state

In [97]:
df = pd.read_csv('complaints.csv', usecols=[ 'Issue',  
                       'Consumer complaint narrative', 
                       'Company response to consumer', ])
df.dropna(inplace=True) 
df = df.rename(columns={
    "Company response to consumer": "response",
    "Consumer complaint narrative": "narrative",})
print(df.shape)

(18838, 3)


In [98]:
df

Unnamed: 0,Issue,narrative,response
0,Credit card protection / Debt protection,On XXXX/XXXX/15 someone contacted synchrony ba...,Closed with explanation
1,Cash advance fee,"A credit card issued by Chase Bank stated "" Ca...",Closed with monetary relief
2,Sale of account,I was told by a Kay Jewelers employee that the...,Closed with monetary relief
3,Billing disputes,I had a credit card issued by Citi. According ...,Closed with monetary relief
4,Transaction issue,Fraud occurred on my Bank of America ( BoA ) c...,Closed with monetary relief
...,...,...,...
18833,Other,American Express continues to harass and take ...,Closed with explanation
18834,Billing disputes,We stayed at a property advertised by XXXX in ...,Closed with explanation
18835,Arbitration,XX/XX/XXXX To Whom it May Concern : My name is...,Closed with monetary relief
18836,APR or interest rate,i am being charged interest on cash advance wh...,Closed with explanation


In [99]:
df['response'].value_counts()[:10]

Closed with explanation            12241
Closed with monetary relief         4328
Closed with non-monetary relief     2140
Closed                               105
Untimely response                     24
Name: response, dtype: int64

In [100]:
target = 'response'
vc = df[target].value_counts()[:3]
classes = list(vc.keys())
obs = vc.values
[ (i, classes[i], obs[i]) for i in range(len(classes)) ]

[(0, 'Closed with explanation', 12241),
 (1, 'Closed with monetary relief', 4328),
 (2, 'Closed with non-monetary relief', 2140)]

In [101]:
mask = [bool(x in classes) for x in df[target]] 
df = df[mask] 
df.shape

(18709, 3)

In [102]:
minobs = min(obs)
print(minobs)

2140


In [103]:
df = df.groupby(target).sample(n=minobs, random_state=rs)
# shuffle
df = df.sample(frac=1, random_state=rs).reset_index(drop=True)
print(df.shape)
print(df[target].value_counts())

(6420, 3)
Closed with monetary relief        2140
Closed with non-monetary relief    2140
Closed with explanation            2140
Name: response, dtype: int64


In [104]:
df['narrative'][:5]

0    I was misled and charged an annual fee of {$75...
1    I generally pay my Wells Fargo credit card bal...
2    I recently signed up for an American Express c...
3    I have a gold amex that was issued with no ann...
4    I have been a cardholder for 16 years. The ini...
Name: narrative, dtype: object

In [105]:
df['narrative'].str.len().mean()

1138.4515576323988

In [106]:
np.mean([ len(lst) for lst in [ x.split() for x in df['narrative'] ] ])

210.10155763239877

In [107]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction import text 
maxf = 500
vctr = CountVectorizer(
    lowercase=True, 
    binary=True, 
    dtype=np.int8,
    max_features=maxf, 
    #ngram_range=(1,2),
    stop_words=text.ENGLISH_STOP_WORDS.union(('xxxx', 'xxxx/xxxx', 
                                              'xx/xx/xxxx')).difference(('not',)),
    token_pattern=r'\b[a-zA-Z]{3,}\b'
    )
vctr.fit(df['narrative'])

CountVectorizer(binary=True, dtype=<class 'numpy.int8'>, max_features=500,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                token_pattern='\\b[a-zA-Z]{3,}\\b')

In [108]:
voc = vctr.vocabulary_

print([ (w, voc[w]) for w in sorted(voc, key=voc.get)][:100])

[('able', 0), ('accepted', 1), ('access', 2), ('according', 3), ('account', 4), ('accounts', 5), ('act', 6), ('action', 7), ('activity', 8), ('actually', 9), ('added', 10), ('addition', 11), ('additional', 12), ('address', 13), ('advised', 14), ('agencies', 15), ('agency', 16), ('agent', 17), ('ago', 18), ('agreed', 19), ('agreement', 20), ('allow', 21), ('allowed', 22), ('america', 23), ('american', 24), ('amex', 25), ('amounts', 26), ('annual', 27), ('answer', 28), ('application', 29), ('applied', 30), ('apply', 31), ('approved', 32), ('approximately', 33), ('apr', 34), ('ask', 35), ('asked', 36), ('asking', 37), ('assured', 38), ('attached', 39), ('attempt', 40), ('attempted', 41), ('attempts', 42), ('authorized', 43), ('auto', 44), ('automatic', 45), ('available', 46), ('aware', 47), ('away', 48), ('bad', 49), ('balance', 50), ('balances', 51), ('bank', 52), ('banking', 53), ('banks', 54), ('based', 55), ('believe', 56), ('benefit', 57), ('best', 58), ('billing', 59), ('bills', 60)

In [109]:
df['narrative'].values[0]

'I was misled and charged an annual fee of {$75.00} on my first statement. \nAn annual fee was not indicated when I applied for the card. \n\nThe application was made on an XXXX flight. \nAirplane attendants discussed and gave out applications which were then completed and given back ti the attendants.'

In [110]:
anlz = vctr.build_analyzer()
toks = anlz(df['narrative'].values[0])
print(toks)

['misled', 'charged', 'annual', 'fee', 'statement', 'annual', 'fee', 'not', 'indicated', 'applied', 'card', 'application', 'flight', 'airplane', 'attendants', 'discussed', 'gave', 'applications', 'completed', 'given', 'attendants']


In [111]:
print(vctr.transform(df['narrative'].values[:1])[0,:])

  (0, 27)	1
  (0, 29)	1
  (0, 30)	1
  (0, 73)	1
  (0, 83)	1
  (0, 107)	1
  (0, 177)	1
  (0, 193)	1
  (0, 195)	1
  (0, 219)	1
  (0, 283)	1
  (0, 434)	1


In [112]:
X = vctr.transform(df['narrative'])

#X = np.asarray(vctr.transform(df['narrative']).todense(), dtype='int8')

print(X.shape)

(6420, 500)


In [113]:
Y = np.array([ classes.index(x)  for x in df[target] ], dtype='int8')

print(Y[:10])

[1 1 2 1 1 2 1 1 0 0]


In [114]:
print(X.shape, Y.shape)

(6420, 500) (6420,)


In [115]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def mlp(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
    print('training size:', X_train.shape[0], 
          'testing size:', X_test.shape[0],
          'label counts:', np.unique(y_train, return_counts=True)[1])

    clf = MLPClassifier(max_iter=10).fit(X_train, y_train)

    print('score train:', clf.score(X_train, y_train))
    print('score test: ', clf.score(X_test, y_test))
    
mlp(X, Y)

training size: 5136 testing size: 1284 label counts: [1739 1699 1698]




score train: 0.7978971962616822
score test:  0.5171339563862928


In [116]:
glove = {}
for line in open('glove.6B.50d.txt'):
    lst = line.split()
    glove[lst[0]] = np.asarray([ float(x) for x in lst[1:]])
print(len(glove))
print(list(glove.keys())[:20])

def pr():
    for w in ('the', 'fee'):
        print(w, ':', glove[w][:5])
    
pr()

400000
['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s", 'for', '-', 'that', 'on', 'is', 'was', 'said', 'with', 'he', 'as']
the : [ 0.418    0.24968 -0.41242  0.1217   0.34527]
fee : [ 0.38186  1.3533   0.54711 -0.66526 -0.5935 ]


In [117]:
def embed(lst):
    emb = np.array([ 0.001 for x in glove['the'] ])
    n = 1
    for w in lst:
        if w in glove: 
            emb += glove[w]
            n += 1
    return emb / n

for x in df['narrative']: 
    print(anlz(x))
    print(embed(anlz(x))[:5])
    break
     
X = np.array([ embed(anlz(x)) for x in df['narrative'] ]) 
X = (X - X.mean(0)) / X.std(0)
print(X.shape, Y.shape)

mlp(X, Y)

['misled', 'charged', 'annual', 'fee', 'statement', 'annual', 'fee', 'not', 'indicated', 'applied', 'card', 'application', 'flight', 'airplane', 'attendants', 'discussed', 'gave', 'applications', 'completed', 'given', 'attendants']
[ 0.3895699   0.15705132  0.10203298 -0.06277164  0.13725309]
(6420, 50) (6420,)
training size: 5136 testing size: 1284 label counts: [1739 1699 1698]




score train: 0.5780763239875389
score test:  0.5264797507788161
