Tuesday, July 14, 2015

Naive Bayes classifier for English words / generic drugs

Looking to post a full text medical search application, ready to deploy, soon. In the meantime ...

Brushing off some skills and piggybacking on an article here, I thought I'd show my fellow clinicians a quick use of the NLTK's naive Bayesian classifier for classifying a word as an English word or as a generic drug.

A good description of the concept of a naive Bayesian classifier is here.

From a directory for a small one off code example:

cat /usr/share/dict/words > ./data/words.txt

Download the DrugBank database, freely available here, into the same directory and unzip it.

Both of the files below can be copied from this gist here and here.

Create a file called parse_drugbank.py with:

# Source: parse_drugbank.py
# -*- coding: utf-8 -*-
import os
import xml.sax

# Directory holding the DrugBank XML dump and all generated output files.
DATA_DIR = "./data/"

class DrugXmlContentHandler(xml.sax.ContentHandler):
    """SAX handler that harvests drug names from a DrugBank XML dump.

    Keeps a stack of open element names so that the full element path
    ("breadcrumb") can be matched at character-data time.
    """

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.tags = []           # stack of currently open element names
        self.generic_names = []  # text collected at drugbank/drug/name
        self.brand_names = []    # only filled if brand parsing is enabled

    def startElement(self, name, attrs):
        # Push on open: the stack mirrors the path from the root.
        self.tags.append(name)

    def endElement(self, name):
        # Pop on close.
        self.tags.pop()

    def characters(self, content):
        # Brand-name harvesting (drugbank/drug/products/product/name)
        # is deliberately disabled; enable by appending to brand_names
        # when the path matches.
        path = "/".join(self.tags)
        if path == "drugbank/drug/name":
            self.generic_names.append(content)
    
def write_list_to_file(lst, filename):
    """Write each element of lst to DATA_DIR/filename, one per line.

    Elements are UTF-8 encoded before writing. Using a context manager
    guarantees the file handle is closed even if a write raises (the
    original leaked the handle on error).
    """
    with open(os.path.join(DATA_DIR, filename), 'wb') as fout:
        for item in lst:
            fout.write("%s\n" % item.encode("utf-8"))

    
# Parse the DrugBank dump and write the harvested name lists to disk.
# 'with' closes the source file even if parsing raises (the original
# open/close pair leaked the handle on a parse error).
handler = DrugXmlContentHandler()
with open(os.path.join(DATA_DIR, "drugbank.xml"), 'rb') as source:
    xml.sax.parse(source, handler)

write_list_to_file(handler.generic_names, "generic_names.txt")
#write_list_to_file(handler.brand_names, "brand_names.txt")

Run it.

Create a file called wordDrugClassify.py with the following, after pip install'ing the appropriate packages (at the top):

# Source: wordDrugClassify.py
# -*- coding: utf-8 -*-
# Thanks to Sujit Pal at sujitpal.blogspot.com
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import string
from operator import itemgetter
import random

# Character n-gram size used throughout (trigrams).
GRAM_SIZE = 3

def word2ngrams(text, n=3, exact=True):
    """Return the list of all character n-grams of text, in order.

    Returns the empty list when len(text) < n.
    NOTE: the `exact` flag is accepted for API compatibility but unused.
    """
    return [text[start:start + n] for start in range(len(text) - n + 1)]

# Character classes used to strip noise characters from tokens.
# set(...) on the string is the direct idiom; the original built each
# set via a redundant list comprehension over the same characters.
PUNCTS = set(string.punctuation)
NUMBERS = set(string.digits)

def is_punct(c):
    """True if c is a single ASCII punctuation character."""
    return c in PUNCTS

def is_number(c):
    """True if c is a single decimal digit character."""
    return c in NUMBERS

def str_to_ngrams(instring, gram_size):
    """Return all character n-grams of the words in instring.

    Each token is lowercased, wrapped in start/end markers ("S"/"E"),
    and stripped of punctuation and digit characters before the n-grams
    are taken. Tokens that fail UTF-8 handling are skipped silently.
    """
    result = []
    for token in nltk.word_tokenize(instring.lower()):
        try:
            marked = "".join(["S", token, "E"]).encode("utf-8")
            kept = [ch for ch in marked if not (is_punct(ch) or is_number(ch))]
            result.extend(["".join(g) for g in nltk.ngrams(kept, gram_size)])
        except UnicodeDecodeError:
            # Best-effort: drop tokens we cannot encode/decode cleanly.
            pass
    return result

def ngram_distrib(words, gram_size):
    """Frequency distribution of character n-grams over a word list."""
    # FreqDist consumes any iterable; feed it the n-grams lazily instead
    # of materializing an intermediate token list.
    return nltk.FreqDist(gram
                         for word in words
                         for gram in str_to_ngrams(word, gram_size))
    
def plot_ngram_distrib(fd, nbest, title, gram_size):
    """Plot the frequencies of the nbest most common n-grams in fd."""
    ranked = sorted([(k, fd[k]) for k in fd], key=itemgetter(1), reverse=True)
    top = ranked[:nbest]
    labels = [pair[0] for pair in top]
    counts = [pair[1] for pair in top]
    positions = np.arange(nbest)
    plt.plot(positions, counts)
    plt.xticks(positions, labels, rotation="90")
    plt.title("%d-gram frequency for %s names (Top %d)" %
              (gram_size, title, nbest))
    plt.xlabel("%d-grams" % (gram_size))
    plt.ylabel("Frequency")
    plt.show()
   
###

# Load the two vocabularies: English dictionary words and generic drug names.
with open("./data/words.txt") as f:
    eng_words = f.read().split()

with open("./data/generic_names.txt") as f:
    generic_names = f.read().split()

# Character n-gram frequency distribution for each vocabulary, then a
# quick visual comparison of the top 30 n-grams from each.
eng = ngram_distrib(eng_words, GRAM_SIZE)
generic = ngram_distrib(generic_names, GRAM_SIZE)

plot_ngram_distrib(eng, 30, "Eng words", GRAM_SIZE)
plot_ngram_distrib(generic, 30, "Generic drugs", GRAM_SIZE)

###

# Label every word with its class and shuffle so the splits below are random.
# BUG FIX: the original was missing the closing parenthesis on this
# expression, which is a SyntaxError.
words = ([(word, 'engWord') for word in eng_words] +
         [(word, 'genericDrug') for word in generic_names])
random.shuffle(words)

###

# Hold out the first 500 shuffled words for the final test set and the
# next 1000 for dev-test error analysis; train on everything else.
train_words = words[1500:]
devtest_words = words[500:1500]
test_words = words[:500]

###

def word_features(word):
    """Feature dict for the classifier: one feature per character
    trigram of the word, keyed by position ("ngram0", "ngram1", ...).

    BUG FIX: the original body was inconsistently indented (an
    IndentationError); the manual counter is replaced with enumerate.
    """
    features = {}
    for i, ngram in enumerate(str_to_ngrams(word, 3)):
        features["ngram" + str(i)] = ngram
    return features

###

# Convert each labeled word into a (feature-dict, label) pair for nltk.
train_set = [(word_features(w), c) for (w,c) in train_words]
devtest_set = [(word_features(w), c) for (w,c) in devtest_words]
test_set = [(word_features(w), c) for (w,c) in test_words]

# Train the naive Bayes classifier on the training pairs.
classifier = nltk.NaiveBayesClassifier.train(train_set)

###

# Error analysis: collect every dev-test word the classifier mislabels.
errors = []
for (word, tag) in devtest_words:
    guess = classifier.classify(word_features(word))
    if guess != tag:
        errors.append( (tag, guess, word) )

# Python 2 print statement: show the correct label vs. the guess for
# each miss, sorted for stable review.
for (tag, guess, word) in sorted(errors):
    print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, word)

###

if __name__ == '__main__':
   while True:
 print
 print "% accurate: ",
 print nltk.classify.accuracy(classifier, devtest_set)
 print "Hit enter with no input to quit."
 
 query = raw_input("Query:")
 if query == '':
 break
 else:
print classifier.classify(word_features(query))

As you can see, the error analysis shows us >96% accuracy on our dev-test set... this example doesn't even bother moving on to the test set.

With just some simple features — here, the character n-grams — we were able to predict with a high degree of accuracy whether a word was a generic drug or an English word.

--JG