Extracting Topics from Trump's Tweets with LSI and LDA
Trump is famous for "governing by tweet". This dataset collects the content of more than 7,000 of his tweets from July 2015 to November 2016; below we try to extract their topics with LSI and LDA.
0 Importing Packages
# Scientific computing
import numpy as np
# Data wrangling and I/O
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex
plt.style.use('ggplot')  # use the ggplot theme
# Nicer statistical plots
import seaborn as sns
sns.set_style("whitegrid")  # set the Seaborn theme
# Interactive visualizations
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)  # Jupyter notebook mode
# Word clouds
from wordcloud import WordCloud
from imageio import imread  # read images
# Text vectorization
from sklearn.feature_extraction.text import CountVectorizer
# The models
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
# Visualizing high-dimensional data
from sklearn.manifold import TSNE
# Convenient container type
from collections import Counter
# Stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
1 Preparing the Data
Import the data, rename the oddly named favourites column to Tweet_Like, and drop any rows where Tweet_Text is NaN.
# Import the data
df = (pd.read_csv('data/Donald-Tweets!.csv', header=0)
        .rename(columns={'twt_favourites_IS_THIS_LIKE_QUESTION_MARK': 'Tweet_Like'})
        .dropna(subset=['Tweet_Text'], axis=0))
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7375 entries, 0 to 7374
Data columns (total 12 columns):
Date 7375 non-null object
Time 7375 non-null object
Tweet_Text 7375 non-null object
Type 7375 non-null object
Media_Type 1225 non-null object
Hashtags 2031 non-null object
Tweet_Id 7375 non-null float64
Tweet_Url 7375 non-null object
Tweet_Like 7375 non-null int64
Retweets 7375 non-null int64
Unnamed: 10 26 non-null float64
Unnamed: 11 13 non-null float64
dtypes: float64(3), int64(2), object(7)
memory usage: 749.0+ KB
# Inspect 5 sample rows
df.sample(5)
Inspecting the sample shows that columns 10 and 11 have no names and are almost entirely NaN (only 26 and 13 non-null values out of 7375), so we drop them outright.
# Drop the blank, unnamed columns 10 and 11
del df['Unnamed: 10']
del df['Unnamed: 11']
# Verify the columns were dropped
df.sample()
Separately, every value in the Tweet_Id column has been rendered in scientific notation, losing precision. By Twitter's URL convention, however, the last segment of a tweet's URL is exactly its ID, so we extract the ID from Tweet_Url and use it to replace Tweet_Id.
# Replace the Tweet ID
df['Tweet_Id'] = df['Tweet_Url'].str[43:]
df.sample()
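The hard-coded slice str[43:] works for this dataset but silently breaks if the URL prefix ever changes length. A more defensive sketch, assuming only that the ID is the last path segment of Tweet_Url:
# Take everything after the last '/' instead of relying on a fixed character offset
df['Tweet_Id'] = df['Tweet_Url'].str.rstrip('/').str.rsplit('/', n=1).str[-1]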
2 Word Cloud
Before extracting topics, let's look at a word cloud of the tweets and at their most frequent words.
# Build the stop-word set and extend it with Twitter-specific noise tokens
stopwords = set(stop)
stopwords.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '@', '#', 'rt', 'amp', 'realdonaldtrump', 'http', 'https', '/', '://', '_', 'co', 'trump', 'donald', 'makeamericagreatagain'])
# Load the mask image
mask = imread('img/trump.jpg')
# Concatenate all Tweet_Text entries into a single lowercase string
twt_text = ' '.join(df['Tweet_Text'].astype(str)).lower()
type(twt_text)
str
# Generate the word cloud
cloud = WordCloud(
    background_color = 'white',
    stopwords = stopwords,
    mask = mask,
    max_words = 1024,
    max_font_size = 100
)
word_cloud = cloud.generate(twt_text)
word_cloud.to_file('output/Trump_Cloud.jpg')
plt.figure(figsize=(12,12))
plt.imshow(word_cloud)
plt.axis('off');
3 Frequent Words
# Vectorize the text
countVectorizer = CountVectorizer(stop_words=stopwords)
vectorizedText = countVectorizer.fit_transform(df['Tweet_Text'].str.replace("'", '').values)
print('Shape Vectorized Text: {}'.format(vectorizedText.shape))
Shape Vectorized Text: (7375, 13690)
# Number of top words to show
n = 20
def nMostFrequentWords(n, countVectorizer, vectorizedText):
    """
    Return the most frequent words and their counts.
    Args:
        n: n most frequent words, int
        countVectorizer: CountVectorizer
        vectorizedText: vectorized text, csr_matrix
    Returns:
        words: most frequent words, list
        wordCounts: word appearance counts, list
    """
    # Count how often each word appears across the corpus
    vectorizedCount = np.sum(vectorizedText, axis=0)
    # Word indices and counts, sorted from most to least frequent
    wordIndices = np.flip(np.argsort(vectorizedCount), 1)
    wordCounts = np.flip(np.sort(vectorizedCount), 1)
    # Build one one-hot word vector per top word
    wordVectors = np.zeros((n, vectorizedText.shape[1]))
    for i in range(n):
        wordVectors[i, wordIndices[0, i]] = 1
    # Map the word vectors back to the actual words
    words = [word[0].encode('ascii').decode('utf-8') for word in countVectorizer.inverse_transform(wordVectors)]
    # Return the most frequent words and their counts
    return (words, wordCounts[0, :n].tolist()[0])
words, wordCounts = nMostFrequentWords(n=n, countVectorizer=countVectorizer, vectorizedText=vectorizedText)
# Create the color map
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
# Build the bar chart
data = go.Bar(x = words,
              y = wordCounts,
              marker = dict(color = colors))
layout = go.Layout(title = 'Most Frequent {} Words in Trump Tweets'.format(n),
                   xaxis = dict(title = 'Words'),
                   yaxis = dict(title = 'Count'))
fig = go.Figure(data=[data], layout=layout)
iplot(fig)
Next we use LSI and LDA to extract topics. The theory behind the two models and the details of their implementations deserve a write-up of their own, so here we only apply them.
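For orientation before the code: LSI is simply a truncated SVD of the document-term count matrix, giving each tweet coordinates in a low-rank "topic" space, while LDA instead models each tweet as a probabilistic mixture of topics. A minimal, self-contained sketch of the LSI idea on a made-up toy matrix (illustration only, not part of the pipeline):
# Toy document-term count matrix: 4 documents x 5 terms
X = np.array([[2, 1, 0, 0, 0],
              [1, 2, 0, 0, 1],
              [0, 0, 3, 1, 0],
              [0, 0, 1, 2, 0]], dtype=float)
# Truncated SVD: X is approximated by U_k * diag(s_k) * Vt_k
U, s, Vt = np.linalg.svd(X, full_matrices=False)
k = 2                        # keep two "topics"
docTopic = U[:, :k] * s[:k]  # document-topic coordinates (what TruncatedSVD.fit_transform returns)
topicTerm = Vt[:k]           # topic-term loadings (what TruncatedSVD stores in components_)
print(docTopic.round(2))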
4 LSI
# Number of topics
nTopics = 8
# Build the LSI model
lsiModel = TruncatedSVD(n_components=nTopics)
lsiTopicMatrix = lsiModel.fit_transform(vectorizedText)
print('Shape LSI Topic Matrix: {}'.format(lsiTopicMatrix.shape))
Shape LSI Topic Matrix: (7375, 8)
# Most probable topic per tweet, and counts per topic
lsiKeys = lsiTopicMatrix.argmax(axis=1)
lsiCategories, lsiCounts = zip(*Counter(lsiKeys).items())
def getTopWords(n, keys, vectorizedText, countVectorizer):
    """
    Get the top words of each topic. (Reused below for the LSI,
    LDA and PCA keys.)
    Args:
        n: n top words, int
        keys: most probable topic per document, ndarray
        countVectorizer: CountVectorizer
        vectorizedText: vectorized text, csr_matrix
    Returns:
        topWords: top n words per topic, list
    """
    # Zero array for the per-topic mean word counts (nTopics is the global set above)
    wordMean = np.zeros((nTopics, vectorizedText.shape[1]))
    # Iterate over the topics that actually occur
    for i in np.unique(keys):
        wordMean[i] += vectorizedText.toarray()[keys == i].mean(axis=0)
    # For each topic, sort and take the n most frequent words
    topWordsIndices = np.flip(np.argsort(wordMean, axis=1)[:, -n:], axis=1)
    topWordsPercentage = (np.divide(np.flip(np.sort(wordMean, axis=1)[:, -n:], axis=1), (np.sum(wordMean, axis=1) + 0.0000001)[:, None]) * 100).astype(int)
    # Collect the words of all topics
    topWords = []
    # Iterate over the topics by index
    for i, (topic, percentage) in enumerate(zip(topWordsIndices, topWordsPercentage)):
        # Collect the words of one topic
        topicWords = []
        if i in np.unique(keys):
            # Iterate over the word indices of this topic
            for index, percent in zip(topic, percentage):
                # Build a one-hot word vector for the index
                wordVector = np.zeros((1, vectorizedText.shape[1]))
                wordVector[0, index] = 1
                # Map the word vector back to the actual word
                word = countVectorizer.inverse_transform(wordVector)[0][0]
                topicWords.append('{}% '.format(percent) + word.encode('ascii').decode('utf-8'))
        # Store this topic's words (empty string for unused topics)
        topWords.append(', '.join(topicWords))
    return topWords
topWords = getTopWords(5, lsiKeys, vectorizedText, countVectorizer)
for i, words in enumerate(topWords):
    print('Topic {}: {}'.format(i, words))
Topic 0: 5% great, 4% thank, 1% america, 1% make, 1% trump2016
Topic 1: 19% hillaryforprison2016, 19% williamesammon1, 19% whereshillary, 19% rueu1ctbz8, 19% sleeping
Topic 2: 2% hillary, 1% clinton, 1% crooked, 0% president, 0% said
Topic 3: 1% new, 1% poll, 1% cruz, 1% people, 1% big
Topic 4: 6% poll, 4% new, 3% trump2016, 2% america, 1% join
Topic 5: 4% trump2016, 1% people, 1% join, 1% tomorrow, 0% us
Topic 6: 2% get, 2% america, 2% vote, 1% like, 0% time
Topic 7: 2% foxnews, 1% cnn, 1% tonight, 1% enjoy, 1% interviewed
# Sort the topics by index
lsiCategoriesSorted, lsiCountsSorted = zip(*sorted(zip(lsiCategories, lsiCounts)))
# Create the labels
topWords = getTopWords(5, lsiKeys, vectorizedText, countVectorizer)
labels = ['Topic {}'.format(i) for i in lsiCategoriesSorted]
# Create the color map
n = nTopics
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
# Build the bar chart
data = go.Bar(x = labels,
              y = lsiCountsSorted,
              text = [word for word in topWords if word],
              marker = dict(color = colors))
layout = go.Layout(title = 'Most Frequent LSI Topics in Trump Tweets',
                   xaxis = dict(title = 'Topic'),
                   yaxis = dict(title = 'Count'))
fig = go.Figure(data=[data], layout=layout)
iplot(fig)
# Reduce the high-dimensional data to 2D for visualization
tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsneModelVectors = tsneModel.fit_transform(lsiTopicMatrix)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 7375 samples in 0.006s...
[t-SNE] Computed neighbors for 7375 samples in 0.605s...
[t-SNE] Computed conditional probabilities for sample 1000 / 7375
[t-SNE] Computed conditional probabilities for sample 2000 / 7375
[t-SNE] Computed conditional probabilities for sample 3000 / 7375
[t-SNE] Computed conditional probabilities for sample 4000 / 7375
[t-SNE] Computed conditional probabilities for sample 5000 / 7375
[t-SNE] Computed conditional probabilities for sample 6000 / 7375
[t-SNE] Computed conditional probabilities for sample 7000 / 7375
[t-SNE] Computed conditional probabilities for sample 7375 / 7375
[t-SNE] Mean sigma: 0.016840
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.068428
[t-SNE] Error after 2000 iterations: 0.829952
# Create the color map
n = nTopics
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
# Get the top words
topWords = getTopWords(3, lsiKeys, vectorizedText, countVectorizer)
# Build the scatter plot
data = []
# Iterate over the topics, sampling ~1000/nTopics tweets per topic
for topic in range(nTopics):
    mask = lsiKeys == topic
    sample_mask = np.zeros(mask.sum()).astype(bool)
    sample_mask[:int(1000/nTopics)] = True
    np.random.shuffle(sample_mask)
    scatter = go.Scatter(x = tsneModelVectors[mask, 0][sample_mask],
                         y = tsneModelVectors[mask, 1][sample_mask],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         text = df[mask]['Tweet_Text'][sample_mask],
                         marker = dict(color = colors[topic]))
    data.append(scatter)
layout = go.Layout(title = 't-SNE Clustering of {} LSI Topics'.format(nTopics),
                   showlegend = True,
                   hovermode = 'closest')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
5 LDA
# Build the LDA model
ldaModel = LatentDirichletAllocation(n_components=nTopics, learning_method='online', random_state=0, verbose=0)
ldaTopicMatrix = ldaModel.fit_transform(vectorizedText)
print('Shape LDA Topic Matrix: {}'.format(ldaTopicMatrix.shape))
Shape LDA Topic Matrix: (7375, 8)
# Most probable topic per tweet, and counts per topic
ldaKeys = ldaTopicMatrix.argmax(axis=1)
ldaCategories, ldaCounts = zip(*Counter(ldaKeys).items())
# Get the top words
topWords = getTopWords(5, ldaKeys, vectorizedText, countVectorizer)
# Print each topic with its top words
for i, words in enumerate(topWords):
    print('Topic {}: {}'.format(i, words))
Topic 0: 1% hillary, 1% clinton, 0% crooked, 0% people, 0% president
Topic 1: 2% trump2016, 2% great, 1% tonight, 1% thank, 1% enjoy
Topic 2: 1% great, 0% join, 0% thank, 0% country, 0% tomorrow
Topic 3: 0% thanks, 0% gop, 0% many, 0% america, 0% hillaryclinton
Topic 4: 1% hillary, 1% cruz, 0% poll, 0% ted, 0% cnn
Topic 5: 3% thank, 2% great, 2% new, 1% poll, 1% trump2016
Topic 6: 4% great, 3% america, 2% make, 1% thank, 0% trump2016
Topic 7: 1% thank, 1% trump2016, 1% like, 0% great, 0% see
# Sort the topics by index
ldaCategoriesSorted, ldaCountsSorted = zip(*sorted(zip(ldaCategories, ldaCounts)))
# Create the labels
topWords = getTopWords(5, ldaKeys, vectorizedText, countVectorizer)
labels = ['Topic {}'.format(i) for i in ldaCategoriesSorted]
# Create the color map
n = nTopics
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
# Build the bar chart
data = go.Bar(x = labels,
              y = ldaCountsSorted,
              text = [word for word in topWords if word],
              marker = dict(color = colors))
layout = go.Layout(title = 'Most Frequent LDA Topics in Trump Tweets',
                   xaxis = dict(title = 'Topic'),
                   yaxis = dict(title = 'Count'))
fig = go.Figure(data=[data], layout=layout)
iplot(fig)
# Reduce the high-dimensional data to 2D for visualization
tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsneModelVectors = tsneModel.fit_transform(ldaTopicMatrix)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 7375 samples in 0.011s...
[t-SNE] Computed neighbors for 7375 samples in 0.737s...
[t-SNE] Computed conditional probabilities for sample 1000 / 7375
[t-SNE] Computed conditional probabilities for sample 2000 / 7375
[t-SNE] Computed conditional probabilities for sample 3000 / 7375
[t-SNE] Computed conditional probabilities for sample 4000 / 7375
[t-SNE] Computed conditional probabilities for sample 5000 / 7375
[t-SNE] Computed conditional probabilities for sample 6000 / 7375
[t-SNE] Computed conditional probabilities for sample 7000 / 7375
[t-SNE] Computed conditional probabilities for sample 7375 / 7375
[t-SNE] Mean sigma: 0.068215
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.528473
[t-SNE] Error after 2000 iterations: 1.265168
# Create the color map
n = nTopics
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
# Get the top words
topWords = getTopWords(3, ldaKeys, vectorizedText, countVectorizer)
# Build the scatter plot
data = []
# Iterate over the topics, sampling ~1000/nTopics tweets per topic
for topic in range(nTopics):
    mask = ldaKeys == topic
    sample_mask = np.zeros(mask.sum()).astype(bool)
    sample_mask[:int(1000/nTopics)] = True
    np.random.shuffle(sample_mask)
    scatter = go.Scatter(x = tsneModelVectors[mask, 0][sample_mask],
                         y = tsneModelVectors[mask, 1][sample_mask],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         text = df[mask]['Tweet_Text'][sample_mask],
                         marker = dict(color = colors[topic]))
    data.append(scatter)
layout = go.Layout(title = 't-SNE Clustering of {} LDA Topics'.format(nTopics),
                   showlegend = True,
                   hovermode = 'closest')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
6 PCA
# Build the PCA model
pcaModel = PCA(n_components=nTopics, random_state=0)
pcaTopicMatrix = pcaModel.fit_transform(vectorizedText.toarray())
print('Shape PCA Topic Matrix: {}'.format(pcaTopicMatrix.shape))
Shape PCA Topic Matrix: (7375, 8)
# Most probable topic per tweet, and counts per topic
pcaKeys = pcaTopicMatrix.argmax(axis=1)
pcaCategories, pcaCounts = zip(*Counter(pcaKeys).items())
# Get the top words
topWords = getTopWords(5, pcaKeys, vectorizedText, countVectorizer)
# Print each topic with its top words
for i, words in enumerate(topWords):
    print('Topic {}: {}'.format(i, words))
Topic 0: 10% great, 2% america, 2% make, 2% thank, 0% people
Topic 1: 11% thank, 3% trump2016, 1% support, 1% nice, 0% new
Topic 2: 6% hillary, 4% clinton, 2% crooked, 0% bad, 0% bernie
Topic 3: 3% poll, 3% new, 1% cruz, 1% big, 1% debate
Topic 4: 1% america, 0% join, 0% jeb, 0% president, 0% make
Topic 5: 8% trump2016, 1% tomorrow, 1% join, 0% job, 0% danscavino
Topic 6: 2% people, 1% get, 1% like, 0% vote, 0% time
Topic 7: 2% foxnews, 2% cnn, 1% tonight, 1% enjoy, 1% interviewed
# Sort the topics by index
pcaCategoriesSorted, pcaCountsSorted = zip(*sorted(zip(pcaCategories, pcaCounts)))
# Create the labels
topWords = getTopWords(5, pcaKeys, vectorizedText, countVectorizer)
labels = ['Topic {}'.format(i) for i in pcaCategoriesSorted]
# Create the color map
n = nTopics
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
# Build the bar chart
data = go.Bar(x = labels,
              y = pcaCountsSorted,
              text = [word for word in topWords if word],
              marker = dict(color = colors))
layout = go.Layout(title = 'Most Frequent PCA Topics in Trump Tweets',
                   xaxis = dict(title = 'Topic'),
                   yaxis = dict(title = 'Count'))
fig = go.Figure(data=[data], layout=layout)
iplot(fig)
# Reduce the high-dimensional data to 2D for visualization
tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsneModelVectors = tsneModel.fit_transform(pcaTopicMatrix)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 7375 samples in 0.010s...
[t-SNE] Computed neighbors for 7375 samples in 0.681s...
[t-SNE] Computed conditional probabilities for sample 1000 / 7375
[t-SNE] Computed conditional probabilities for sample 2000 / 7375
[t-SNE] Computed conditional probabilities for sample 3000 / 7375
[t-SNE] Computed conditional probabilities for sample 4000 / 7375
[t-SNE] Computed conditional probabilities for sample 5000 / 7375
[t-SNE] Computed conditional probabilities for sample 6000 / 7375
[t-SNE] Computed conditional probabilities for sample 7000 / 7375
[t-SNE] Computed conditional probabilities for sample 7375 / 7375
[t-SNE] Mean sigma: 0.018791
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.240501
[t-SNE] Error after 2000 iterations: 0.823488
# Create the color map
n = nTopics
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
# Get the top words
topWords = getTopWords(3, pcaKeys, vectorizedText, countVectorizer)
# Build the scatter plot
data = []
# Iterate over the topics, sampling ~1000/nTopics tweets per topic
for topic in range(nTopics):
    mask = pcaKeys == topic
    sample_mask = np.zeros(mask.sum()).astype(bool)
    sample_mask[:int(1000/nTopics)] = True
    np.random.shuffle(sample_mask)
    scatter = go.Scatter(x = tsneModelVectors[mask, 0][sample_mask],
                         y = tsneModelVectors[mask, 1][sample_mask],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         text = df[mask]['Tweet_Text'][sample_mask],
                         marker = dict(color = colors[topic]))
    data.append(scatter)
layout = go.Layout(title = 't-SNE Clustering of {} PCA Topics'.format(nTopics),
                   showlegend = True,
                   hovermode = 'closest')
fig = go.Figure(data=data, layout=layout)
iplot(fig)