The decision tree is a simple but potentially powerful machine learning technique with some very appealing properties.
Here is some sample code to pre-process the text in the sentiment file:
import numpy as np
data = [ line.split() for line in open('imdb_labelled.txt').readlines() ]
# '.' at end of last word in each sentence, remove with strip()
sents = [ [ w.strip('.,').lower() for w in line[:-1] ] for line in data ]
y = np.asarray([ int(line[-1]) for line in data ])
# remove stop words
stop = ['a','-','the','it','in','and','or','with','to','of','as','was','is',"it's",'from','for','this','on','at']
sents = [ [ w for w in s if w not in stop ] for s in sents ]
print(sents[:5])
print('mean sentence length:', np.mean([len(s) for s in sents]))
print(y[:5])
print(len(y), sum(y))
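To illustrate the normalization step in isolation, here is what strip('.,') followed by lower() does to a few made-up tokens:

```python
# strip('.,') removes leading/trailing periods and commas only; lower() normalizes case
tokens = ['Great.', 'movie,', 'A']
normalized = [w.strip('.,').lower() for w in tokens]
print(normalized)  # ['great', 'movie', 'a']
```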
Now we encode each sentence simply by membership in a small vocabulary:
from collections import defaultdict
cnt = defaultdict(int)
for s in sents:
    for w in s:
        cnt[w] += 1
voc = [ w for w in cnt if cnt[w] > 20 ]
print('voc len:', len(voc))
X = [ [ int(w in sent) for w in voc ] for sent in sents ]
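To make the encoding concrete, here is the same membership scheme applied to a hypothetical three-sentence corpus:

```python
# Hypothetical mini-corpus and vocabulary
toy_sents = [['good', 'movie'], ['bad', 'movie'], ['good', 'film']]
toy_voc = ['good', 'bad', 'movie']
# One binary feature per vocabulary word: 1 if the sentence contains it
toy_X = [[int(w in sent) for w in toy_voc] for sent in toy_sents]
print(toy_X)  # [[1, 0, 1], [0, 1, 1], [1, 0, 0]]
```

Each row of X is a fixed-length binary vector, regardless of sentence length, which is exactly the shape a decision tree expects.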
This is enough to start training and evaluating our decision tree:
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
clf = tree.DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
Right away we achieve reasonably good performance.
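With only 10% of the data held out, the accuracy fluctuates noticeably between runs; cross-validation gives a steadier estimate. A minimal sketch with scikit-learn's cross_val_score, shown here on synthetic stand-in data since X and y above come from the file:

```python
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Hypothetical stand-in for the X, y built above: 200 rows of binary features
rng = np.random.default_rng(0)
X_demo = rng.integers(0, 2, size=(200, 30))
y_demo = X_demo[:, 0]  # label fully determined by one "word" feature
# 5-fold cross-validation averages over five different held-out splits
scores = cross_val_score(DecisionTreeClassifier(max_depth=3), X_demo, y_demo, cv=5)
print('mean accuracy: %.3f (+/- %.3f)' % (scores.mean(), scores.std()))
```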
In addition, we can now visualize the process:
from sklearn.tree import export_graphviz
import graphviz
export_graphviz(clf, feature_names=voc, out_file="mytree.dot")
with open("mytree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
This gives us the chance to quickly understand how the machine learning method arrives at its result.
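If graphviz is not installed, scikit-learn's export_text offers a plain-text view of the same tree structure. A minimal sketch on toy data (the data and feature names here are made up for illustration):

```python
from sklearn.tree import DecisionTreeClassifier, export_text

# Toy data: the first feature alone separates the two classes
X_demo = [[0, 1], [0, 0], [1, 1], [1, 0]]
y_demo = [0, 0, 1, 1]
clf_demo = DecisionTreeClassifier(max_depth=2).fit(X_demo, y_demo)
# Prints an indented, ASCII rendering of the learned splits
print(export_text(clf_demo, feature_names=['bad', 'good']))
```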
Here is a version of the decision tree with nicer layout; however, it involves some coding to get just what we want.
data = [ line.split() for line in open('imdb_labelled.txt').readlines() ]
# '.' at end of last word in each sentence, remove with strip()
sents = [ [ w.strip('.,').lower() for w in line[:-1] ] for line in data ]
y = np.asarray([ int(line[-1]) for line in data ])
cnt = defaultdict(int)
for s in sents:
for w in s:
cnt[w] += 1
# remove stop words
stop = 'a,-,the,it,in,and,or,with,to,of,as,was,is,it\'s,from,for,this,on,at,do,been,has,her'
stop += ',after,part,90\'s,we,i\'ve,were,its'
sents = [ [ w for w in s if w not in stop.split(',') ] for s in sents ]
voc = [ w for w in cnt if cnt[w] > 2 and cnt[w] < 50 ]
print('voc len:', len(voc))
print(voc[:50])
# invert membership (0 <-> 1) so the graph's branch labels read more naturally
X = [ [ int(w not in sent) for w in voc ] for sent in sents ]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
print('sum(y_test):', sum(y_test))
clf = tree.DecisionTreeClassifier(max_depth=13)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
export_graphviz(clf, feature_names=voc, class_names=['neg','pos'], out_file="mytree.dot")
with open("mytree.dot") as f:
    g = f.read()
print(g[:300])
It turns out that the input to the graphviz module is just a string, so we can manipulate that content to reduce each label to the word, the number of samples, and the class. This makes the tree more readable and also smaller, so we can go to a higher value for the depth and still fit it nicely on the page.
import re
g = re.sub(r" <= \d\.\d+\\n", "", g)
g = re.sub(r"gini = \d\.\d+\\n", "\\n", g)
g = re.sub(r"value = \[\d+, \d+\]\\n", "", g)
g = re.sub(r"samples = ", "", g)
g = re.sub(r"\\nclass = ", " ", g)
#print(g)
graphviz.Source(g)
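To see what these substitutions do to an individual node, here is their effect on a single made-up label in the style export_graphviz writes (the dot file contains literal backslash-n sequences rather than newlines, hence the doubled backslashes in the patterns):

```python
import re

# A made-up node label in the dot file's style (literal \n, not real newlines)
label = r'label="bad <= 0.5\ngini = 0.48\nsamples = 50\nvalue = [30, 20]\nclass = neg"'
label = re.sub(r" <= \d\.\d+\\n", "", label)          # drop the split threshold
label = re.sub(r"gini = \d\.\d+\\n", "\\n", label)    # drop the gini line
label = re.sub(r"value = \[\d+, \d+\]\\n", "", label) # drop the class counts
label = re.sub(r"samples = ", "", label)              # keep just the number
label = re.sub(r"\\nclass = ", " ", label)            # put class on the same line
print(label)  # only the word, sample count, and class remain
```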