Looking to post a full text medical search application, ready to deploy, soon. In the meantime ...
Brushing off some skills and piggybacking on an article here, I thought I'd show my fellow clinicians a quick use of the NLTK's naive Bayesian classifier for classifying a word as an English word or as a generic drug.
Good description of the concept of a naive bayesian classifier here.
From a directory for a small one off code example:
cat /usr/share/dict/words > ./data/words.txt
Download into the same directory the drugbank freely available here and unzip it.
Both of the below files can be copied from this gist here and here.
Create a file called parse_drugbank.py with:
# Source: parse_drugbank.py
# -*- coding: utf-8 -*-
import os
import xml.sax
DATA_DIR = "./data/"
class DrugXmlContentHandler(xml.sax.ContentHandler):
    """SAX handler that collects drug names from a DrugBank XML dump.

    A breadcrumb (stack of currently-open tag names) is maintained so that
    only <name> elements at the exact path drugbank/drug/name -- the generic
    drug name -- are collected.  Collection of brand names (path
    drugbank/drug/products/product/name) is present but disabled.
    """

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.tags = []           # stack of open element names (breadcrumb)
        self.generic_names = []  # text of every drugbank/drug/name element
        self.brand_names = []    # unused while brand collection is disabled
        self._text = []          # character chunks of the current element

    def startElement(self, name, attrs):
        self.tags.append(name)
        self._text = []

    def characters(self, content):
        # SAX may deliver one element's text in several chunks; buffer the
        # chunks and join them in endElement.  Appending each chunk directly
        # (as the original code did) can split a single name into several
        # partial entries.
        self._text.append(content)

    def endElement(self, name):
        # The breadcrumb still includes `name` here, since we pop after.
        breadcrumb = "/".join(self.tags)
        if breadcrumb == "drugbank/drug/name":
            self.generic_names.append("".join(self._text))
        #if breadcrumb == "drugbank/drug/products/product/name":
        #    self.brand_names.append("".join(self._text))
        self.tags.pop()
        self._text = []
def write_list_to_file(lst, filename):
    """Write each element of *lst* to DATA_DIR/<filename>, one per line.

    The file is opened in text mode with an explicit UTF-8 encoding: the
    original 'wb' + per-line ``.encode("utf-8")`` pattern only works on
    Python 2 (on Python 3 writing str into a binary file raises TypeError).
    A context manager replaces the explicit close so the handle is released
    even if a write fails.  Any existing file is overwritten.
    """
    with open(os.path.join(DATA_DIR, filename), "w", encoding="utf-8") as fout:
        for entry in lst:
            fout.write("%s\n" % entry)
# Parse the DrugBank XML dump with the SAX handler above, then persist the
# extracted generic names to disk (brand-name output is currently disabled).
with open(os.path.join(DATA_DIR, "drugbank.xml"), 'rb') as source:
    handler = DrugXmlContentHandler()
    xml.sax.parse(source, handler)
write_list_to_file(handler.generic_names, "generic_names.txt")
#write_list_to_file(handler.brand_names, "brand_names.txt")
Run it.
Create a file called wordDrugClassify.py with the following, after pip install'ing the appropriate packages (at the top):
# Source: wordDrugClassify.py
# -*- coding: utf-8 -*-
# Thanks to Sujit Pal at sujitpal.blogspot.com
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import string
from operator import itemgetter
import random
GRAM_SIZE = 3


def word2ngrams(text, n=3, exact=True):
    """Return the character n-grams of *text* as a list of strings.

    NOTE(review): the *exact* parameter is accepted for interface
    compatibility but never used by the implementation.
    """
    shifted = [text[offset:] for offset in range(n)]
    return ["".join(chars) for chars in zip(*shifted)]
# set() over an iterable directly -- the set([c for c in ...]) list
# comprehensions in the original were redundant.
PUNCTS = set(string.punctuation)   # single-character punctuation marks
NUMBERS = set("0123456789")        # ASCII digit characters


def is_punct(c):
    """Return True if character *c* is ASCII punctuation."""
    return c in PUNCTS


def is_number(c):
    """Return True if character *c* is an ASCII digit."""
    return c in NUMBERS
def str_to_ngrams(instring, gram_size):
    """Tokenize *instring* and return the character n-grams of every token.

    Each token is lower-cased, wrapped in sentinel characters "S"/"E" so the
    classifier can learn word-start and word-end grams, stripped of
    punctuation and digit characters, and split into `gram_size`-character
    grams.

    Fix: the original encoded each word to UTF-8 bytes before iterating.
    On Python 3 iterating bytes yields ints, so the later "".join raised
    TypeError (and the UnicodeDecodeError guard never fired).  Working on
    the text directly preserves the Python 2 behavior for ASCII input and
    makes the function run on Python 3.
    """
    ngrams = []
    for word in nltk.word_tokenize(instring.lower()):
        wrapped = "S" + word + "E"
        cleaned = [c for c in wrapped if not (is_punct(c) or is_number(c))]
        ngrams.extend("".join(gram) for gram in nltk.ngrams(cleaned, gram_size))
    return ngrams
def ngram_distrib(words, gram_size):
    """Return an nltk.FreqDist of character n-grams over all *words*."""
    return nltk.FreqDist(gram
                         for word in words
                         for gram in str_to_ngrams(word, gram_size))
def plot_ngram_distrib(fd, nbest, title, gram_size):
    """Plot the *nbest* most frequent n-grams in FreqDist *fd* (blocking)."""
    ranked = sorted(fd.items(), key=itemgetter(1), reverse=True)[:nbest]
    labels = [gram for gram, _ in ranked]
    counts = [count for _, count in ranked]
    positions = np.arange(nbest)
    plt.plot(positions, counts)
    plt.xticks(positions, labels, rotation="90")
    plt.title("%d-gram frequency for %s names (Top %d)" %
              (gram_size, title, nbest))
    plt.xlabel("%d-grams" % (gram_size))
    plt.ylabel("Frequency")
    plt.show()
###
# Load the two corpora: system dictionary words and DrugBank generic names.
with open("./data/words.txt") as f:
    eng_words = f.read().split()
with open("./data/generic_names.txt") as f:
    generic_names = f.read().split()

# Visualize the trigram frequency profiles of both corpora.
eng = ngram_distrib(eng_words, GRAM_SIZE)
generic = ngram_distrib(generic_names, GRAM_SIZE)
plot_ngram_distrib(eng, 30, "Eng words", GRAM_SIZE)
plot_ngram_distrib(generic, 30, "Generic drugs", GRAM_SIZE)

###
# Label every word with its class, then shuffle so the splits below are
# random.  Fix: the original was missing the closing parenthesis after the
# second list comprehension, which made the file a syntax error.
words = ([(word, 'engWord') for word in eng_words] +
         [(word, 'genericDrug') for word in generic_names])
random.shuffle(words)

###
# Split: first 500 -> test, next 1000 -> dev-test, remainder -> training.
train_words = words[1500:]
devtest_words = words[500:1500]
test_words = words[:500]
###
def word_features(word):
    """Feature dict for *word*: its character trigrams keyed ngram0, ngram1, ..."""
    return {"ngram" + str(index): gram
            for index, gram in enumerate(str_to_ngrams(word, 3))}
###
# Featurize each split and train the classifier on the training portion.
train_set = [(word_features(w), c) for (w, c) in train_words]
devtest_set = [(word_features(w), c) for (w, c) in devtest_words]
test_set = [(word_features(w), c) for (w, c) in test_words]
classifier = nltk.NaiveBayesClassifier.train(train_set)

###
# Error analysis: list every dev-test word the classifier got wrong.
errors = []
for (word, tag) in devtest_words:
    guess = classifier.classify(word_features(word))
    if guess != tag:
        errors.append((tag, guess, word))
for (tag, guess, word) in sorted(errors):
    # Parenthesized single argument: the original bare `print` statement is
    # Python 2 only; this form is valid on both Python 2 and 3.
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, word))
###
if __name__ == '__main__':
    # Interactive demo: report dev-test accuracy, then classify typed words
    # until the user submits an empty line.
    #
    # Fix: the original used Python 2-only `print` statements and
    # `raw_input`; this form runs on both Python 2 and Python 3.
    try:
        read_query = raw_input   # Python 2
    except NameError:
        read_query = input       # Python 3: raw_input was renamed to input
    while True:
        print("")
        print("% accurate: " +
              str(nltk.classify.accuracy(classifier, devtest_set)))
        print("Hit enter with no input to quit.")
        query = read_query("Query:")
        if query == '':
            break
        else:
            print(classifier.classify(word_features(query)))
As you can see, the error analysis shows us >96% accuracy on our dev-test set... this example doesn't even bother moving on to the test set.
With just some simple features, here the ngrams, we were able to predict with a high degree of accuracy whether a word was a generic drug or an english word.
--JG