In [None]:
import pandas as pd

# author: Susan Li 
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

# Latent Dirichlet Allocation (LDA) is a modeling technique to automatically group 
# a collection of documents into topics so that new documents can then be classified

# The data set is a list of over one million news headlines published over a period of 15 years.
# on_bad_lines='skip' drops malformed rows instead of raising. It replaces
# error_bad_lines=False, which was deprecated in pandas 1.3 and removed in 2.0.
data = pd.read_csv('../Data/abcnews-date-text.csv', on_bad_lines='skip')
# .copy() makes data_text an independent frame, so adding a column below does
# not write into a slice of `data` (which triggers SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
# Keep the row position as an explicit column so a headline can be looked up by id.
data_text['index'] = data_text.index
documents = data_text

In [None]:
# Quick look at the data: number of rows, then the first five rows
print(documents.shape[0])
print(documents.head(5))

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(3748)

In [None]:
import nltk
# Fetch the WordNet corpus through nltk's downloader; presumably needed by the
# WordNetLemmatizer used in lemmatize_stemming — TODO confirm.
nltk.download('wordnet')

# Shared English stemmer used by lemmatize_stemming.
# Alternative kept for reference: stemmer = PorterStemmer()
stemmer = SnowballStemmer("english")

In [None]:
def lemmatize_stemming(text):
    """Lemmatize a single word as a verb, then stem the resulting lemma.

    Args:
        text: one token (word) as a string.

    Returns:
        The output of the notebook-level `stemmer` applied to the WordNet
        verb lemma (pos='v') of `text`.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

In [None]:
def preprocess(text, min_token_len=4):
    """Turn a headline into a list of stemmed, lemmatized content words.

    Steps:
      1. Tokenize with gensim.utils.simple_preprocess (per gensim's docs this
         lowercases the text and yields word tokens, dropping punctuation).
      2. Discard tokens found in gensim's STOPWORDS set.
      3. Discard tokens shorter than `min_token_len` characters. With the
         default of 4, words of 3 characters or fewer are removed.
      4. Pass each surviving token through lemmatize_stemming.

    Args:
        text: the raw headline string.
        min_token_len: minimum token length (in characters) to keep.

    Returns:
        list of str: the processed tokens, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) >= min_token_len
    ]

In [None]:
# Inspect one headline (id 4310) before and after preprocessing.
# The first column of the first matching row holds the headline text.
doc_sample = documents[documents['index'] == 4310].iloc[0, 0]

print(' Words in the original headline: ')
words = doc_sample.split(' ')
print(words)

print('\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [None]:
# Run the preprocessing pipeline over every headline, then show the first ten results
processed_docs = documents.loc[:, 'headline_text'].map(preprocess)
processed_docs.head(10)

In [None]:
# Build a gensim Dictionary from the processed headlines, then print its
# first 11 (token id, token) pairs as a sanity check
dictionary = gensim.corpora.Dictionary(processed_docs)
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

In [None]:
# Convert every processed headline to its bag-of-words form: a list of
# (token id, count) pairs. Then show the entry for headline 4310.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[4310]

In [None]:
# Spell out the bag-of-words entry for headline 4310: token id, the word it maps to, and its count
bow_doc_4310 = bow_corpus[4310]
for token_id, occurrences in bow_doc_4310:
    word = dictionary[token_id]
    print("Word {} (\"{}\") appears {} time.".format(token_id, word, occurrences))

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus, apply the transformation to
# the entire corpus, and preview the first transformed document
from gensim import corpora, models
from pprint import pprint

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

first_weighted_doc = next(iter(corpus_tfidf), None)
if first_weighted_doc is not None:
    pprint(first_weighted_doc)

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus with gensim.models.LdaMulticore
# (2 passes over the corpus, 2 workers)
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
)

In [None]:
# List each topic of the bag-of-words model with its word description
for topic_id, topic_words in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_words))

In [None]:
# Fit a second 10-topic LDA model, this time on the TF-IDF-transformed corpus
# (2 passes, 4 workers), and list its topics
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus=corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
)
for topic_id, topic_words in lda_model_tfidf.print_topics(num_topics=-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_words))

In [None]:
# Next, use the models for classification.
# Start with headline 4310: these are its processed tokens.
processed_docs.loc[4310]

In [None]:
# Score headline 4310 against every topic of the bag-of-words model, highest score first
for topic_id, weight in sorted(lda_model[bow_corpus[4310]], key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(weight, lda_model.print_topic(topic_id, 10)))

In [None]:
# Observation from the previous cell's output: the top-scoring topic contains
# the word "help", which also appears in the query document.

# Repeat the classification of headline 4310 with the TF-IDF model, highest score first
for topic_id, weight in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(weight, lda_model_tfidf.print_topic(topic_id, 10)))

In [None]:
# Observation from the previous cell's output: the TF-IDF model found a topic which had "rain" in it.

# Classify a headline that was not in the training data: preprocess it,
# convert it to bag-of-words with the existing dictionary, then rank topics by score
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for topic_id, weight in sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True):
    print("Score: {}\t Topic: {}".format(weight, lda_model.print_topic(topic_id, 5)))