{ "cells": [
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# author: Susan Li\n", "# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24\n", "\n", "# Latent Dirichlet Allocation (LDA) is a modeling technique that automatically groups\n", "# a collection of documents into topics, so that new documents can then be classified.\n", "\n", "# The data set is a list of over one million news headlines published over a period of 15 years.\n", "# on_bad_lines='skip' replaces the deprecated error_bad_lines=False (pandas >= 1.3)\n", "data = pd.read_csv('../Data/abcnews-date-text.csv', on_bad_lines='skip')\n", "data_text = data[['headline_text']].copy()  # copy so the added column does not trigger SettingWithCopyWarning\n", "data_text['index'] = data_text.index\n", "documents = data_text" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A peek at the data\n", "print(len(documents))\n", "print(documents[:5])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import gensim\n", "from gensim.utils import simple_preprocess\n", "from gensim.parsing.preprocessing import STOPWORDS\n", "from nltk.stem import WordNetLemmatizer, SnowballStemmer\n", "from nltk.stem.porter import PorterStemmer\n", "import numpy as np\n", "np.random.seed(3748)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('wordnet')\n", "\n", "# stemmer = PorterStemmer()\n", "stemmer = SnowballStemmer(\"english\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Lemmatize, then stem: lemmatization with pos='v' reduces verbs to their base form\n", "# (e.g. 'went' -> 'go'); the stemmer then strips remaining suffixes.\n", "def lemmatize_stemming(text):\n", "    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))" ] },
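{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A quick sanity check of lemmatize_stemming on a couple of sample words\n", "# (illustrative inputs, not drawn from the data set):\n", "print(lemmatize_stemming('went'))     # lemmatizer maps 'went' -> 'go'\n", "print(lemmatize_stemming('running'))  # 'running' -> 'run'" ] },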
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Tokenize each headline into words, lowercasing and stripping punctuation\n", "# (gensim's simple_preprocess does all three).\n", "# Drop stopwords and short tokens (3 characters or fewer),\n", "# then lemmatize and stem what remains.\n", "def preprocess(text):\n", "    result = []\n", "    for token in gensim.utils.simple_preprocess(text):\n", "        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:\n", "            result.append(lemmatize_stemming(token))\n", "    return result" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Pick a sample document, say index 4310\n", "doc_sample = documents[documents['index'] == 4310].values[0][0]\n", "print('Words in the original headline:')\n", "words = []\n", "for word in doc_sample.split(' '):\n", "    words.append(word)\n", "print(words)\n", "\n", "# Optional: exercise the stemmer directly on a list of sample words\n", "# plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',\n", "#            'died', 'agreed', 'owned', 'humbled', 'sized',\n", "#            'meeting', 'stating', 'siezing', 'itemization',\n", "#            'sensational', 'traditional', 'reference', 'colonizer',\n", "#            'plotted']\n", "# singles = [stemmer.stem(plural) for plural in plurals]\n", "# print(' '.join(singles))\n", "\n", "print('\\nTokenized and lemmatized document:')\n", "print(preprocess(doc_sample))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Preprocess every headline and save the results in processed_docs\n", "processed_docs = documents['headline_text'].map(preprocess)\n", "processed_docs[:10]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Build a dictionary from 'processed_docs', mapping each unique token to an integer id\n", "dictionary = gensim.corpora.Dictionary(processed_docs)\n", "\n", "# Peek at the first ten (id, token) pairs\n", "count = 0\n", "for k, v in dictionary.items():\n", "    print(k, v)\n", "    count += 1\n", "    if count >= 10:\n", "        break" ] },
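{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional pruning step (the linked article uses these thresholds; tune them for your corpus):\n", "# drop tokens appearing in fewer than 15 documents or in more than half of all documents,\n", "# then keep only the 100,000 most frequent of the remainder.\n", "dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)" ] },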
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Convert each document to a bag of words: a list of (token_id, token_count) pairs\n", "bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]\n", "bow_corpus[4310]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Spell out the bag-of-words representation of document 4310\n", "bow_doc_4310 = bow_corpus[4310]\n", "for token_id, token_count in bow_doc_4310:\n", "    print(\"Word {} (\\\"{}\\\") appears {} time(s).\".format(token_id, dictionary[token_id], token_count))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a TF-IDF model from the bag-of-words corpus,\n", "# then apply the transformation to the entire corpus\n", "from gensim import models\n", "tfidf = models.TfidfModel(bow_corpus)\n", "corpus_tfidf = tfidf[bow_corpus]\n", "\n", "from pprint import pprint\n", "for doc in corpus_tfidf:\n", "    pprint(doc)\n", "    break" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Train an LDA model using gensim.models.LdaMulticore\n", "lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for idx, topic in lda_model.print_topics(-1):\n", "    print('Topic: {} \\nWords: {}'.format(idx, topic))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Run LDA on the TF-IDF-weighted corpus\n", "lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)\n", "for idx, topic in lda_model_tfidf.print_topics(-1):\n", "    print('Topic: {} \\nWords: {}'.format(idx, topic))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Next, use the model for classification.\n", "# Let's try to classify our sample headline 4310\n", "processed_docs[4310]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Score headline 4310 against the bag-of-words LDA model; topics are sorted by descending probability\n", "for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n", "    print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model.print_topic(index, 10)))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The top topic above shares the word \"help\" with the query document\n", "\n", "# Next, score the same headline with the TF-IDF model\n", "for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n", "    print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model_tfidf.print_topic(index, 10)))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This model's top topic contains the word \"rain\"\n", "\n", "# Finally, classify an unseen headline\n", "unseen_document = 'How a Pentagon deal became an identity crisis for Google'\n", "bow_vector = dictionary.doc2bow(preprocess(unseen_document))\n", "for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):\n", "    print(\"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 5)))" ] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }