Translated by: 杨毅远
Proofread by: 吴金笛
This article is about 4,400 words; estimated reading time: 8 minutes.
https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/corona_fake.csv
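pandas can read this CSV directly from the raw GitHub URL above, so downloading a local copy first is optional. A minimal sketch:

import pandas as pd

url = 'https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/corona_fake.csv'
df = pd.read_csv(url)  # pd.read_csv accepts URLs as well as local file paths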
import pandas as pd
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer

from textblob import TextBlob
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

df = pd.read_csv('data/corona_fake.csv')

# Normalize inconsistent spellings of labels and sources
df.loc[df['label'] == 'Fake', ['label']] = 'FAKE'
df.loc[df['label'] == 'fake', ['label']] = 'FAKE'
df.loc[df['source'] == 'facebook', ['source']] = 'Facebook'

# Correct a few mislabeled rows by hand; df.loc[row, col] assigns
# in place, avoiding pandas' chained-assignment pitfall
df.loc[5, 'label'] = 'FAKE'
df.loc[15, 'label'] = 'TRUE'
df.loc[43, 'label'] = 'FAKE'
df.loc[131, 'label'] = 'TRUE'
df.loc[242, 'label'] = 'FAKE'

# Shuffle the rows and check the class balance
df = df.sample(frac=1).reset_index(drop=True)
df.label.value_counts()
process_data.py
df.loc[df['label'] == 'TRUE'].source.value_counts()
df.loc[df['label'] == 'FAKE'].source.value_counts()
def print_plot(index):
    # Print the text and label of the article at a given index
    example = df[df.index == index][['text', 'label']].values[0]
    if len(example) > 0:
        print(example[0])
        print('label:', example[1])

print_plot(500)
print_plot.py
print_plot(1000)
Since the article text in our dataset is already fairly clean, all we need to do is remove punctuation and convert the text to lowercase.
# Strip punctuation, then lowercase the text
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
df['text'] = df['text'].str.lower()
We compute a sentiment polarity score for each article; the score lies in [-1, 1], where 1 indicates positive sentiment and -1 indicates negative sentiment.
We also compute the length (word count) of each article.
# fillna('') guards against missing text, which TextBlob cannot handle
df['polarity'] = df['text'].fillna('').map(lambda text: TextBlob(text).sentiment.polarity)

def text_len(x):
    # Word count; non-string values (e.g. NaN) count as zero
    if type(x) is str:
        return len(x.split())
    else:
        return 0

df['text_len'] = df['text'].apply(text_len)

nums_text = df.query('text_len > 0')['text_len']
fig = ff.create_distplot(hist_data=[nums_text], group_labels=['Text'])
fig.update_layout(title_text='Distribution of article length', template="plotly_white")
fig.show()
polarity_length.py
Figure 3
fig = px.histogram(df, x="text_len", y="text", color="label",
                   marginal="box",
                   hover_data=df.columns, nbins=100)
fig.update_layout(title_text='Distribution of article length', template="plotly_white")
fig.show()
text_len_hist.py
Figure 4
To show the probability density of article length for each label, we use a violin plot:
fig = px.violin(df, y='text_len', color='label',
                violinmode='overlay',
                hover_data=df.columns, template='plotly_white')
fig.show()
text_len_violin.py
Figure 5
Facebook vs. Harvard
On average, Facebook posts are much shorter than Harvard Health articles:
df_new = df.loc[(df['source'] == 'Facebook') | (df['source'] == 'https://www.health.harvard.edu/')]

fig = px.histogram(df_new, x="text_len", y="text", color='source',
                   marginal="box",
                   hover_data=df_new.columns, nbins=100)
fig.update_layout(title_text='Distribution of article length of two sources', template="plotly_white")
fig.show()
facebook_harvard_textlen_hist.py
fig = px.violin(df_new, y='text_len', color='source',
                violinmode='overlay',
                hover_data=df_new.columns, template='plotly_white')
fig.show()
Figure 7
Sentiment polarity
x1 = df.loc[df['label'] == 'TRUE']['polarity']
x2 = df.loc[df['label'] == 'FAKE']['polarity']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels, colors=colors)

fig.update_layout(title_text='polarity', template="plotly_white")
fig.show()
label_polarity.py
fig = px.violin(df, y='polarity', color="label",
                violinmode='overlay',
                template='plotly_white')
fig.show()
x1 = df.loc[df['source'] == 'Facebook']['polarity']
x2 = df.loc[df['source'] == 'https://www.health.harvard.edu/']['polarity']
x3 = df.loc[df['source'] == 'https://www.nytimes.com/']['polarity']
x4 = df.loc[df['source'] == 'https://www.naturalnews.com/']['polarity']
group_labels = ['Facebook', 'Harvard', 'nytimes', 'naturalnews']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)', 'rgb(100, 0, 0)', 'rgb(200, 0, 200)']

# Create distplot with custom bin_size
fig = ff.create_distplot(
    [x1, x2, x3, x4], group_labels, colors=colors)

fig.update_layout(title_text='polarity', template="plotly_white")
fig.show()
fig = go.Figure()

sources = ['https://www.health.harvard.edu/', 'https://www.nytimes.com/', 'Facebook', 'https://www.naturalnews.com/']

for source in sources:
    fig.add_trace(go.Violin(x=df['source'][df['source'] == source],
                            y=df['polarity'][df['source'] == source],
                            name=source,
                            box_visible=True,
                            meanline_visible=True))
fig.update_layout(title_text='Polarity of four sources', template='plotly_white')
fig.show()
source_violin.py
Sentiment vs. article length vs. veracity
fig = px.density_contour(df, x='polarity', y='text_len', marginal_x='histogram', marginal_y='histogram', template='plotly_white')
fig.update_layout(title_text='Sentiment vs. Article length')
fig.show()
fig = px.scatter(df, x='polarity', y='text_len', color='label', template="plotly_white")
fig.update_layout(title_text='Sentiment polarity')
fig.show()
polarity_scatter.py
Figure 13
df.groupby(['source']).mean(numeric_only=True).sort_values('polarity', ascending=False)  # numeric_only avoids errors on text columns in newer pandas
Figure 14
df.loc[df['source'] == 'RudyGiuliani']['text'][880]
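The helper get_top_n_bigram used below is not defined in this excerpt. A minimal sketch built on the CountVectorizer imported earlier might look like this (the bigram range and English stop-word filtering are assumptions, not confirmed by this excerpt):

def get_top_n_bigram(corpus, n=None):
    corpus = corpus.fillna('')  # guard against missing text
    # Count all bigrams in the corpus, ignoring English stop words
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    # Return the n most frequent bigrams, most common first
    return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]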
common_bigram_true = get_top_n_bigram(df.loc[df['label'] == 'TRUE']['text'], 20)
for word, freq in common_bigram_true:
    print(word, freq)
common_bigram_fake = get_top_n_bigram(df.loc[df['label'] == 'FAKE']['text'], 20)
for word, freq in common_bigram_fake:
    print(word, freq)
fake_bigram.py
Promoting cures: this includes the use of high-dose intravenous vitamin C.
Speculation about the origin: this theme includes claims that the coronavirus was created in a lab to be used as a bioweapon, or that 5G technology caused the disease.
Rumors about influential figures: for example, that Bill Gates and Dr. Fauci orchestrated the coronavirus on behalf of pharmaceutical companies.
Exploiting people's fears: for example, that the Melinda Gates Foundation and Johns Hopkins University predicted the coronavirus three months earlier through Event 201.
One clear difference between the true and fake news in our data is that the fake news appears to use people's names far more often, suggesting that fake news may be more personalized.
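One quick way to sanity-check this observation is to count PERSON entities per article with a named-entity recognizer. This sketch is not part of the original article and assumes spaCy and its en_core_web_sm model are installed:

import spacy

nlp = spacy.load('en_core_web_sm')  # assumes the small English model is installed

def count_person_names(text):
    # Count entities that spaCy tags as PERSON in one article
    return sum(ent.label_ == 'PERSON' for ent in nlp(text).ents)

df['n_person'] = df['text'].fillna('').apply(count_person_names)
print(df.groupby('label')['n_person'].mean())  # compare TRUE vs. FAKE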
naturalnews.com vs orthomolecular.org
Both of these sources promote conspiracy theories, but they focus on different topics.
naturalnews_bigram = get_top_n_bigram(df.loc[df['source'] == 'https://www.naturalnews.com/']['text'], 20)
for word, freq in naturalnews_bigram:
    print(word, freq)
natural_bigram.py
ortho_bigram = get_top_n_bigram(df.loc[df['source'] == 'http://orthomolecular.org/']['text'], 20)
for word, freq in ortho_bigram:
    print(word, freq)
ortho_bigram.py
Original title:
Explore COVID-19 Infodemic
Original link:
https://towardsdatascience.com/explore-covid-19-infodemic-2d1ceaae2306
About the translator
杨毅远 (Yang Yiyuan) is a first-year master's student in the Department of Automation at Tsinghua University. He received his bachelor's degree from the experimental class of the School of Automation at Huazhong University of Science and Technology, and his research focuses on AI algorithms for industrial process monitoring. He enjoys singing and trying new things, and is particularly interested in "AI+". Having gotten started in computer vision and data mining, he hopes to explore fields beyond his own research area and broaden his knowledge.
——END——