{ "cells": [
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# author: Susan Li\n", "# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24\n", "\n", "# Latent Dirichlet Allocation (LDA) is a modeling technique that automatically groups\n", "# a collection of documents into topics, so that new documents can then be classified.\n", "\n", "# The data set is a list of over one million news headlines published over a period of 15 years.\n", "# on_bad_lines='skip' replaces the deprecated error_bad_lines=False (pandas >= 1.3)\n", "data = pd.read_csv('../Data/abcnews-date-text.csv', on_bad_lines='skip')\n", "data_text = data[['headline_text']].copy()  # copy so the added column does not trigger SettingWithCopyWarning\n", "data_text['index'] = data_text.index\n", "documents = data_text" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A peek at the data\n", "print(len(documents))\n", "print(documents[:5])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import gensim\n", "from gensim.utils import simple_preprocess\n", "from gensim.parsing.preprocessing import STOPWORDS\n", "from nltk.stem import WordNetLemmatizer, SnowballStemmer\n", "from nltk.stem.porter import PorterStemmer\n", "import numpy as np\n", "np.random.seed(3748)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('wordnet')\n", "\n", "# stemmer = PorterStemmer()\n", "stemmer = SnowballStemmer(\"english\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Lemmatize, then stem: lemmatization with pos='v' reduces verbs to their base form\n", "# (e.g. 'went' -> 'go'); the stemmer then strips remaining suffixes.\n", "def lemmatize_stemming(text):\n", "    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))" ] },
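{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A quick sanity check of lemmatize_stemming on a couple of sample words\n", "# (illustrative inputs, not drawn from the data set):\n", "print(lemmatize_stemming('went'))     # lemmatizer maps 'went' -> 'go'\n", "print(lemmatize_stemming('running'))  # 'running' -> 'run'" ] },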
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Tokenize each headline into words, lowercasing and stripping punctuation\n", "# (gensim's simple_preprocess does all three).\n", "# Drop stopwords and short tokens (3 characters or fewer),\n", "# then lemmatize and stem what remains.\n", "def preprocess(text):\n", "    result = []\n", "    for token in gensim.utils.simple_preprocess(text):\n", "        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:\n", "            result.append(lemmatize_stemming(token))\n", "    return result" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Pick a sample document, say index 4310\n", "doc_sample = documents[documents['index'] == 4310].values[0][0]\n", "print('Words in the original headline:')\n", "words = []\n", "for word in doc_sample.split(' '):\n", "    words.append(word)\n", "print(words)\n", "\n", "# Optional: exercise the stemmer directly on a list of sample words\n", "# plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',\n", "#            'died', 'agreed', 'owned', 'humbled', 'sized',\n", "#            'meeting', 'stating', 'siezing', 'itemization',\n", "#            'sensational', 'traditional', 'reference', 'colonizer',\n", "#            'plotted']\n", "# singles = [stemmer.stem(plural) for plural in plurals]\n", "# print(' '.join(singles))\n", "\n", "print('\\nTokenized and lemmatized document:')\n", "print(preprocess(doc_sample))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Preprocess every headline and save the results in processed_docs\n", "processed_docs = documents['headline_text'].map(preprocess)\n", "processed_docs[:10]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Build a dictionary from 'processed_docs', mapping each unique token to an integer id\n", "dictionary = gensim.corpora.Dictionary(processed_docs)\n", "\n", "# Peek at the first ten (id, token) pairs\n", "count = 0\n", "for k, v in dictionary.items():\n", "    print(k, v)\n", "    count += 1\n", "    if count >= 10:\n", "        break" ] },
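{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional pruning step (the linked article uses these thresholds; tune them for your corpus):\n", "# drop tokens appearing in fewer than 15 documents or in more than half of all documents,\n", "# then keep only the 100,000 most frequent of the remainder.\n", "dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)" ] },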
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Convert each document to a bag of words: a list of (token_id, token_count) pairs\n", "bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]\n", "bow_corpus[4310]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Spell out the bag-of-words representation of document 4310\n", "bow_doc_4310 = bow_corpus[4310]\n", "for token_id, token_count in bow_doc_4310:\n", "    print(\"Word {} (\\\"{}\\\") appears {} time(s).\".format(token_id, dictionary[token_id], token_count))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a TF-IDF model from the bag-of-words corpus,\n", "# then apply the transformation to the entire corpus\n", "from gensim import models\n", "tfidf = models.TfidfModel(bow_corpus)\n", "corpus_tfidf = tfidf[bow_corpus]\n", "\n", "from pprint import pprint\n", "for doc in corpus_tfidf:\n", "    pprint(doc)\n", "    break" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Train an LDA model using gensim.models.LdaMulticore\n", "lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for idx, topic in lda_model.print_topics(-1):\n", "    print('Topic: {} \\nWords: {}'.format(idx, topic))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Run LDA on the TF-IDF-weighted corpus\n", "lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)\n", "for idx, topic in lda_model_tfidf.print_topics(-1):\n", "    print('Topic: {} \\nWords: {}'.format(idx, topic))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Next, use the model for classification.\n", "# Let's try to classify our sample headline 4310\n", "processed_docs[4310]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Score headline 4310 against the bag-of-words LDA model; topics are sorted by descending probability\n", "for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n", "    print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model.print_topic(index, 10)))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The top topic above shares the word \"help\" with the query document\n", "\n", "# Next, score the same headline with the TF-IDF model\n", "for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n", "    print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model_tfidf.print_topic(index, 10)))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This model's top topic contains the word \"rain\"\n", "\n", "# Finally, classify an unseen headline\n", "unseen_document = 'How a Pentagon deal became an identity crisis for Google'\n", "bow_vector = dictionary.doc2bow(preprocess(unseen_document))\n", "for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):\n", "    print(\"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 5)))" ] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }