In [1]:
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
In [2]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
/Users/chenjosh/anaconda3/lib/python3.6/site-packages/_pytest/fixtures.py:844: DeprecationWarning: The `convert` argument is deprecated in favor of `converter`.  It will be removed after 2019/01.
  params = attr.ib(convert=attr.converters.optional(tuple))
/Users/chenjosh/anaconda3/lib/python3.6/site-packages/_pytest/fixtures.py:846: DeprecationWarning: The `convert` argument is deprecated in favor of `converter`.  It will be removed after 2019/01.
  ids = attr.ib(default=None, convert=_ensure_immutable_ids)
In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
In [5]:
cands = pd.read_csv('../data/candidates.txt', header=None)
cands = cands[0].to_list()
print(cands)

uri_re = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
twi_re = re.compile(r"pic.twitter.com/(?:[a-zA-Z]|[0-9])+")

def remove_uri(tweet):
    try:
        tweet = re.sub(uri_re, "", tweet)
        tweet = re.sub(twi_re, "", tweet)
    except TypeError:
#         print('error in tweet content', tweet)
        tweet = ""
    return tweet

# return string
def get_recent_tweets_string(name, n=-1):
    df_1 = pd.read_json(f"../data/{name}.json", lines=True)
    print("%s's %s tweets" % (name, len(df_1['tweet'])))
    return "\n".join(list(map(remove_uri, df_1['tweet'][0:n].to_list())))

def get_recent_tweets_list(name, n=-1):
    df_1 = pd.read_json(f"../data/{name}.json", lines=True)
    print("%s's %s tweets" % (name, len(df_1['tweet'])))
    return list(map(remove_uri, df_1['tweet'][0:n].to_list()))

# ==== tokenizer ====
import unicodedata

def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

def _clean_text(text):
    """Performs invalid character removal and whitespace cleanup on text."""
    output = []
    for char in text:
        cp = ord(char)
        if cp == 0 or cp == 0xfffd or _is_control(char):
            continue
        if _is_whitespace(char):
            output.append(" ")
        else:
            output.append(char)
    return "".join(output)

def _run_split_on_punc(text):
    """Splits punctuation on a piece of text."""
    chars = list(text)
    i = 0
    start_new_word = True
    output = []
    while i < len(chars):
        char = chars[i]
        if _is_punctuation(char):
            output.append([char])
            start_new_word = True
        else:
            if start_new_word:
                output.append([])
            start_new_word = False
            output[-1].append(char)
        i += 1

    return ["".join(x) for x in output]
    
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
      return []
    tokens = text.split()
    return tokens

def tokenize(text):
    """Tokenizes a piece of text."""
    text = _clean_text(text)
    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
        token = token.lower()
#       token = self._run_strip_accents(token)
        split_tokens.extend(_run_split_on_punc(token))

    return split_tokens
    
def lexical_diversity(text):
    data = tokenize(text)
    word_count = len(data) or 1
    vocab_size = len(set(data))
    diversity_score = vocab_size / word_count
    return [vocab_size, word_count, diversity_score]

# get_recent_tweets_list(cands[4])[10709]
lex = [lexical_diversity(get_recent_tweets_string(cand)) for cand in cands]
# docs_raw = [get_recent_tweets_string(name, 500) for name in cands]
['andrewYang', 'ewarren', 'BetoORourke', 'BernieSanders', 'KamalaHarris', 'BilldeBlasio', 'JohnDelaney', 'PeteButtigieg', 'Hickenlooper', 'GovernorBullock', 'MichaelBennet', 'amyklobuchar', 'TulsiGabbard', 'JayInslee', 'JulianCastro', 'CoryBooker', 'JoeBiden', 'TimRyan', 'SenGillibrand', 'marwilliamson']
andrewYang's 7545 tweets
ewarren's 4553 tweets
BetoORourke's 5372 tweets
BernieSanders's 10324 tweets
KamalaHarris's 10728 tweets
BilldeBlasio's 1374 tweets
JohnDelaney's 7136 tweets
PeteButtigieg's 6006 tweets
Hickenlooper's 3288 tweets
GovernorBullock's 6484 tweets
MichaelBennet's 1608 tweets
amyklobuchar's 8644 tweets
TulsiGabbard's 6849 tweets
JayInslee's 3593 tweets
JulianCastro's 3837 tweets
CoryBooker's 54161 tweets
JoeBiden's 1826 tweets
TimRyan's 3464 tweets
SenGillibrand's 17052 tweets
marwilliamson's 15993 tweets
In [6]:
for i, l in enumerate(lex):
    print("%s use %s words in %s tokens, diversity(#words/#tokens) is %s \n" % (cands[i], l[0], l[1], l[2]) )
andrewYang use 10831 words in 161361 tokens, diversity(#words/#tokens) is 0.06712278679482651 

ewarren use 9075 words in 156134 tokens, diversity(#words/#tokens) is 0.058123150627025506 

BetoORourke use 10710 words in 167616 tokens, diversity(#words/#tokens) is 0.06389604810996563 

BernieSanders use 10308 words in 270010 tokens, diversity(#words/#tokens) is 0.03817636383837636 

KamalaHarris use 12765 words in 313707 tokens, diversity(#words/#tokens) is 0.04069083571613002 

BilldeBlasio use 4001 words in 39415 tokens, diversity(#words/#tokens) is 0.10150957757199036 

JohnDelaney use 10902 words in 201060 tokens, diversity(#words/#tokens) is 0.054222620113398984 

PeteButtigieg use 11068 words in 140258 tokens, diversity(#words/#tokens) is 0.0789117198305979 

Hickenlooper use 9527 words in 81686 tokens, diversity(#words/#tokens) is 0.1166295326004456 

GovernorBullock use 10697 words in 158768 tokens, diversity(#words/#tokens) is 0.06737503779099063 

MichaelBennet use 4779 words in 45567 tokens, diversity(#words/#tokens) is 0.10487853051550464 

amyklobuchar use 15582 words in 253842 tokens, diversity(#words/#tokens) is 0.06138464083957738 

TulsiGabbard use 13182 words in 179134 tokens, diversity(#words/#tokens) is 0.07358737034845422 

JayInslee use 7407 words in 95064 tokens, diversity(#words/#tokens) is 0.0779159303206261 

JulianCastro use 9145 words in 95807 tokens, diversity(#words/#tokens) is 0.09545231559280637 

CoryBooker use 50689 words in 1247901 tokens, diversity(#words/#tokens) is 0.040619408110098475 

JoeBiden use 5048 words in 54958 tokens, diversity(#words/#tokens) is 0.09185195967829979 

TimRyan use 7603 words in 90832 tokens, diversity(#words/#tokens) is 0.08370398097586754 

SenGillibrand use 16356 words in 484350 tokens, diversity(#words/#tokens) is 0.03376896872096624 

marwilliamson use 15118 words in 370622 tokens, diversity(#words/#tokens) is 0.04079088667159532 

In [4]:
def plot(name, recent=None):
    docs_raw = get_recent_tweets_list(name, recent)
    tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                    stop_words = 'english',
                                    lowercase = True,
                                    token_pattern = r'\b[a-zA-Z]{3,}\b',
                                    max_df = 0.5, 
                                    min_df = 10)
    dtm_tf = tf_vectorizer.fit_transform(docs_raw)
#     print(dtm_tf.shape)

    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
#     print(dtm_tfidf.shape)

    # for TF DTM
    lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
    lda_tf.fit(dtm_tf)
    # for TFIDF DTM
    lda_tfidf = LatentDirichletAllocation(n_components=10, random_state=0)
    lda_tfidf.fit(dtm_tfidf)
    return tf_vectorizer, lda_tf, dtm_tf
In [13]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[16])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
JoeBiden's 1826 tweets
Out[13]:
In [14]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[1])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
ewarren's 4553 tweets
Out[14]:
In [15]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[2])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
BetoORourke's 5372 tweets
Out[15]:
In [16]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[3])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
BernieSanders's 10324 tweets
Out[16]:
In [17]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[4])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
KamalaHarris's 10728 tweets
Out[17]:
In [18]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[7])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
PeteButtigieg's 6006 tweets
Out[18]:
In [19]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[11])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
amyklobuchar's 8644 tweets
Out[19]:
In [20]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[7])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
PeteButtigieg's 6006 tweets
Out[20]:
In [21]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[14])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
JulianCastro's 3837 tweets
Out[21]:
In [22]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[15])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
CoryBooker's 54161 tweets
Out[22]:
In [23]:
tf_vectorizer, lda_tf, dtm_tf = plot(cands[0])
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
andrewYang's 7545 tweets
Out[23]: