Python数据分析与机器学习47-维基百科词条EDA
2022-08-05 本文已影响0人
只是甲
一. 数据源介绍
train_1.csv:
维基百科各个词条每天点击量
二. 将浮点型转为整数
浮点型数据更占内存,所以我们可以将浮点型转为整型,减小内存的消耗,从而加快程序运行的速度
代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
# Load the data source and replace missing values with 0.
train = pd.read_csv('E:/file/train_1.csv')
train = train.fillna(0)
print(train.head())
# info() prints its report itself and returns None, so this also prints "None".
print(train.info())
print("########################################################")

# Floats take more memory: downcast every column after the first one
# (the page name) to an integer dtype.
for name in train.columns[1:]:
    train[name] = pd.to_numeric(train[name], downcast='integer')
print(train.head())
print(train.info())
print("########################################################")
测试记录:
Page ... 2016-12-31
0 2NE1_zh.wikipedia.org_all-access_spider ... 20.0
1 2PM_zh.wikipedia.org_all-access_spider ... 20.0
2 3C_zh.wikipedia.org_all-access_spider ... 17.0
3 4minute_zh.wikipedia.org_all-access_spider ... 11.0
4 52_Hz_I_Love_You_zh.wikipedia.org_all-access_s... ... 10.0
[5 rows x 551 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145063 entries, 0 to 145062
Columns: 551 entries, Page to 2016-12-31
dtypes: float64(550), object(1)
memory usage: 609.8+ MB
None
########################################################
Page ... 2016-12-31
0 2NE1_zh.wikipedia.org_all-access_spider ... 20
1 2PM_zh.wikipedia.org_all-access_spider ... 20
2 3C_zh.wikipedia.org_all-access_spider ... 17
3 4minute_zh.wikipedia.org_all-access_spider ... 11
4 52_Hz_I_Love_You_zh.wikipedia.org_all-access_s... ... 10
[5 rows x 551 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145063 entries, 0 to 145062
Columns: 551 entries, Page to 2016-12-31
dtypes: int32(550), object(1)
memory usage: 305.5+ MB
None
########################################################
三. 获取网页的语言
代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
# Load the data source and replace missing values with 0.
train = pd.read_csv('E:/file/train_1.csv')
train = train.fillna(0)

# Optional float -> integer downcast to save memory (disabled in this section):
# for col in train.columns[1:]:
#     train[col] = pd.to_numeric(train[col], downcast='integer')
def get_language(page):
    """Return the two-letter Wikipedia language code found in a page name.

    Args:
        page: Page identifier such as
            ``'2NE1_zh.wikipedia.org_all-access_spider'``.

    Returns:
        The two lowercase letters directly in front of ``.wikipedia.org``,
        or ``'na'`` when the name contains no such host.
    """
    # The dots are escaped so they match a literal '.' only; an unescaped
    # '.' matches any character and would accept malformed hosts.
    res = re.search(r'[a-z][a-z]\.wikipedia\.org', page)
    if res:
        return res.group()[0:2]
    return 'na'
from collections import Counter

# Tag each page with its language code and print the number of pages per code.
train['lang'] = train['Page'].map(get_language)
lang_counts = Counter(train['lang'])
print(lang_counts)
测试记录:
Counter({'en': 24108, 'ja': 20431, 'de': 18547, 'na': 17855, 'fr': 17802, 'zh': 17229, 'ru': 15022, 'es': 14069})
四. 分析不同语言的时间序列
代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import Counter
# Load the data source and replace missing values with 0.
train = pd.read_csv('E:/file/train_1.csv')
train = train.fillna(0)

# Optional float -> integer downcast to save memory (disabled in this section):
# for col in train.columns[1:]:
#     train[col] = pd.to_numeric(train[col], downcast='integer')
def get_language(page):
    """Return the two-letter Wikipedia language code found in a page name.

    Args:
        page: Page identifier such as
            ``'2NE1_zh.wikipedia.org_all-access_spider'``.

    Returns:
        The two lowercase letters directly in front of ``.wikipedia.org``,
        or ``'na'`` when the name contains no such host.
    """
    # The dots are escaped so they match a literal '.' only; an unescaped
    # '.' matches any character and would accept malformed hosts.
    res = re.search(r'[a-z][a-z]\.wikipedia\.org', page)
    if res:
        return res.group()[0:2]
    return 'na'
# Tag every page with its language code.
train['lang'] = train.Page.map(get_language)

# Group the rows by language in a dict keyed by language code. The last
# column (the 'lang' helper added above) is dropped from every frame.
# Insertion order is preserved, so later loops over the dict visit the
# languages in exactly this order.
lang_sets = {
    code: train[train.lang == code].iloc[:, 0:-1]
    for code in ('en', 'ja', 'de', 'na', 'fr', 'zh', 'ru', 'es')
}

# Average views per page for each day: per-column sums over everything
# after the first column, divided by the number of rows of that language.
sums = {
    code: frame.iloc[:, 1:].sum(axis=0) / frame.shape[0]
    for code, frame in lang_sets.items()
}

# Day offsets 0..N-1, used as the x axis when plotting.
days = list(range(sums['en'].shape[0]))
# Plot the average daily views per page, one line per language.
labels = {
    'en': 'English', 'ja': 'Japanese', 'de': 'German',
    'na': 'Media', 'fr': 'French', 'zh': 'Chinese',
    'ru': 'Russian', 'es': 'Spanish',
}

fig = plt.figure(1, figsize=[10, 10])
plt.title('Pages in Different Languages')
plt.xlabel('Day')
plt.ylabel('Views per Page')
for key, series in sums.items():
    plt.plot(days, series, label=labels[key])
plt.legend()
plt.show()
测试记录:
我们可以看到英文的明显高于其他语言的
中间凸起的,一般是有热点事件发生,浏览量飞速上升
五. 查看英文下各个词条的时间序列
代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import Counter
# Load the data source and replace missing values with 0.
train = pd.read_csv('E:/file/train_1.csv')
train = train.fillna(0)

# Optional float -> integer downcast to save memory (disabled in this section):
# for col in train.columns[1:]:
#     train[col] = pd.to_numeric(train[col], downcast='integer')
def get_language(page):
    """Return the two-letter Wikipedia language code found in a page name.

    Args:
        page: Page identifier such as
            ``'2NE1_zh.wikipedia.org_all-access_spider'``.

    Returns:
        The two lowercase letters directly in front of ``.wikipedia.org``,
        or ``'na'`` when the name contains no such host.
    """
    # The dots are escaped so they match a literal '.' only; an unescaped
    # '.' matches any character and would accept malformed hosts.
    res = re.search(r'[a-z][a-z]\.wikipedia\.org', page)
    if res:
        return res.group()[0:2]
    return 'na'
# Tag every page with its language code.
train['lang'] = train.Page.map(get_language)

# Group the rows by language in a dict keyed by language code. The last
# column (the 'lang' helper added above) is dropped from every frame.
# Insertion order is preserved, so later loops over the dict visit the
# languages in exactly this order.
lang_sets = {
    code: train[train.lang == code].iloc[:, 0:-1]
    for code in ('en', 'ja', 'de', 'na', 'fr', 'zh', 'ru', 'es')
}

# Average views per page for each day: per-column sums over everything
# after the first column, divided by the number of rows of that language.
sums = {
    code: frame.iloc[:, 1:].sum(axis=0) / frame.shape[0]
    for code, frame in lang_sets.items()
}

# Day offsets 0..N-1, used as the x axis when plotting.
days = list(range(sums['en'].shape[0]))
def plot_entry(key, idx):
    """Plot the daily view counts of one page and show the figure.

    Reads the module-level ``lang_sets`` and ``days``.

    Args:
        key: Language code selecting a frame in ``lang_sets``.
        idx: Positional (0-based) row of the page within that frame.
    """
    frame = lang_sets[key]
    plt.figure(1, figsize=(10, 5))
    # Everything after the first column of the selected row is plotted
    # against the day offsets.
    plt.plot(days, frame.iloc[idx, 1:])
    plt.xlabel('day')
    plt.ylabel('views')
    # Take the title from the first column of the same row that is plotted.
    # The earlier lookup passed an index label to train.iloc, which is
    # positional and therefore only right while train keeps its default
    # 0..N-1 index.
    plt.title(frame.iloc[idx, 0])
    plt.show()
# Row positions (within the English frame) of the sample pages to plot.
idx = [
    1, 5, 10, 50, 100, 250, 500,
    750, 1000, 1500, 2000, 3000, 4000, 5000,
]
for row_pos in idx:
    plot_entry('en', row_pos)
    plt.show()
测试记录:
后面的进行省略
六. 各个语言的热点词条
代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import Counter
# Load the data source and replace missing values with 0.
train = pd.read_csv('E:/file/train_1.csv')
train = train.fillna(0)

# Optional float -> integer downcast to save memory (disabled in this section):
# for col in train.columns[1:]:
#     train[col] = pd.to_numeric(train[col], downcast='integer')
def get_language(page):
    """Return the two-letter Wikipedia language code found in a page name.

    Args:
        page: Page identifier such as
            ``'2NE1_zh.wikipedia.org_all-access_spider'``.

    Returns:
        The two lowercase letters directly in front of ``.wikipedia.org``,
        or ``'na'`` when the name contains no such host.
    """
    # The dots are escaped so they match a literal '.' only; an unescaped
    # '.' matches any character and would accept malformed hosts.
    res = re.search(r'[a-z][a-z]\.wikipedia\.org', page)
    if res:
        return res.group()[0:2]
    return 'na'
# Tag every page with its language code.
train['lang'] = train.Page.map(get_language)

# Group the rows by language in a dict keyed by language code. The last
# column (the 'lang' helper added above) is dropped from every frame.
# Insertion order is preserved, so later loops over the dict visit the
# languages in exactly this order.
lang_sets = {
    code: train[train.lang == code].iloc[:, 0:-1]
    for code in ('en', 'ja', 'de', 'na', 'fr', 'zh', 'ru', 'es')
}

# Average views per page for each day: per-column sums over everything
# after the first column, divided by the number of rows of that language.
sums = {
    code: frame.iloc[:, 1:].sum(axis=0) / frame.shape[0]
    for code, frame in lang_sets.items()
}

# Day offsets 0..N-1, used as the x axis when plotting.
days = list(range(sums['en'].shape[0]))
# Number of top rows printed per language. This was defined but unused
# while the row count was hard-coded as 10 below; it now holds that 10.
npages = 10
# Maps language code -> index label of that language's most viewed page.
top_pages = {}
for key, frame in lang_sets.items():
    print(key)
    # Explicit copy: the 'total' column is added to this frame only.
    sum_set = frame[['Page']].copy()
    # Sum only the columns after 'Page'. Summing whole rows would pull the
    # string 'Page' column into the reduction, which raises TypeError on
    # pandas >= 2.0 (numeric_only defaults to False there).
    sum_set['total'] = frame.iloc[:, 1:].sum(axis=1)
    sum_set = sum_set.sort_values('total', ascending=False)
    print(sum_set.head(npages))
    top_pages[key] = sum_set.index[0]
    print('\n\n')
# Plot the daily views of the most viewed page of each language.
# The slice skips the first and the last column of train.
cols = train.columns[1:-1]
for key, top_label in top_pages.items():
    fig = plt.figure(1, figsize=(10, 5))
    data = train.loc[top_label, cols]
    plt.plot(days, data)
    plt.title(train.loc[top_label, 'Page'])
    plt.xlabel('Days')
    plt.ylabel('Views')
    plt.show()
测试记录:
en
Page total
38573 Main_Page_en.wikipedia.org_all-access_all-agents 1.206618e+10
9774 Main_Page_en.wikipedia.org_desktop_all-agents 8.774497e+09
74114 Main_Page_en.wikipedia.org_mobile-web_all-agents 3.153985e+09
39180 Special:Search_en.wikipedia.org_all-access_all... 1.304079e+09
10403 Special:Search_en.wikipedia.org_desktop_all-ag... 1.011848e+09
74690 Special:Search_en.wikipedia.org_mobile-web_all... 2.921628e+08
39172 Special:Book_en.wikipedia.org_all-access_all-a... 1.339931e+08
10399 Special:Book_en.wikipedia.org_desktop_all-agents 1.332859e+08
33644 Main_Page_en.wikipedia.org_all-access_spider 1.290204e+08
34257 Special:Search_en.wikipedia.org_all-access_spider 1.243102e+08
ja
Page total
120336 メインページ_ja.wikipedia.org_all-access_all-agents 210753795.0
86431 メインページ_ja.wikipedia.org_desktop_all-agents 134147415.0
123025 特別:検索_ja.wikipedia.org_all-access_all-agents 70316929.0
89202 特別:検索_ja.wikipedia.org_desktop_all-agents 69215206.0
57309 メインページ_ja.wikipedia.org_mobile-web_all-agents 66459122.0
119609 特別:最近の更新_ja.wikipedia.org_all-access_all-agents 17662791.0
88897 特別:最近の更新_ja.wikipedia.org_desktop_all-agents 17627621.0
119625 真田信繁_ja.wikipedia.org_all-access_all-agents 10793039.0
123292 特別:外部リンク検索_ja.wikipedia.org_all-access_all-agents 10331191.0
89463 特別:外部リンク検索_ja.wikipedia.org_desktop_all-agents 10327917.0
de
Page total
139119 Wikipedia:Hauptseite_de.wikipedia.org_all-acce... 1.603934e+09
116196 Wikipedia:Hauptseite_de.wikipedia.org_mobile-w... 1.112689e+09
67049 Wikipedia:Hauptseite_de.wikipedia.org_desktop_... 4.269924e+08
140151 Spezial:Suche_de.wikipedia.org_all-access_all-... 2.234259e+08
66736 Spezial:Suche_de.wikipedia.org_desktop_all-agents 2.196368e+08
140147 Spezial:Anmelden_de.wikipedia.org_all-access_a... 4.029181e+07
138800 Special:Search_de.wikipedia.org_all-access_all... 3.988154e+07
68104 Spezial:Anmelden_de.wikipedia.org_desktop_all-... 3.535523e+07
68511 Special:MyPage/toolserverhelferleinconfig.js_d... 3.258496e+07
137765 Hauptseite_de.wikipedia.org_all-access_all-agents 3.173246e+07
na
Page total
45071 Special:Search_commons.wikimedia.org_all-acces... 67150638.0
81665 Special:Search_commons.wikimedia.org_desktop_a... 63349756.0
45056 Special:CreateAccount_commons.wikimedia.org_al... 53795386.0
45028 Main_Page_commons.wikimedia.org_all-access_all... 52732292.0
81644 Special:CreateAccount_commons.wikimedia.org_de... 48061029.0
81610 Main_Page_commons.wikimedia.org_desktop_all-ag... 39160923.0
46078 Special:RecentChangesLinked_commons.wikimedia.... 28306336.0
45078 Special:UploadWizard_commons.wikimedia.org_all... 23733805.0
81671 Special:UploadWizard_commons.wikimedia.org_des... 22008544.0
82680 Special:RecentChangesLinked_commons.wikimedia.... 21915202.0
fr
Page total
27330 Wikipédia:Accueil_principal_fr.wikipedia.org_a... 868480667.0
55104 Wikipédia:Accueil_principal_fr.wikipedia.org_m... 611302821.0
7344 Wikipédia:Accueil_principal_fr.wikipedia.org_d... 239589012.0
27825 Spécial:Recherche_fr.wikipedia.org_all-access_... 95666374.0
8221 Spécial:Recherche_fr.wikipedia.org_desktop_all... 88448938.0
26500 Sp?cial:Search_fr.wikipedia.org_all-access_all... 76194568.0
6978 Sp?cial:Search_fr.wikipedia.org_desktop_all-ag... 76185450.0
131296 Wikipédia:Accueil_principal_fr.wikipedia.org_a... 63860799.0
26993 Organisme_de_placement_collectif_en_valeurs_mo... 36647929.0
7213 Organisme_de_placement_collectif_en_valeurs_mo... 36624145.0
zh
Page total
28727 Wikipedia:首页_zh.wikipedia.org_all-access_all-a... 123694312.0
61350 Wikipedia:首页_zh.wikipedia.org_desktop_all-agents 66435641.0
105844 Wikipedia:首页_zh.wikipedia.org_mobile-web_all-a... 50887429.0
28728 Special:搜索_zh.wikipedia.org_all-access_all-agents 48678124.0
61351 Special:搜索_zh.wikipedia.org_desktop_all-agents 48203843.0
28089 Running_Man_zh.wikipedia.org_all-access_all-ag... 11485845.0
30960 Special:链接搜索_zh.wikipedia.org_all-access_all-a... 10320403.0
63510 Special:链接搜索_zh.wikipedia.org_desktop_all-agents 10320336.0
60711 Running_Man_zh.wikipedia.org_desktop_all-agents 7968443.0
30446 瑯琊榜_(電視劇)_zh.wikipedia.org_all-access_all-agents 5891589.0
ru
Page total
99322 Заглавная_страница_ru.wikipedia.org_all-access... 1.086019e+09
103123 Заглавная_страница_ru.wikipedia.org_desktop_al... 7.428800e+08
17670 Заглавная_страница_ru.wikipedia.org_mobile-web... 3.279304e+08
99537 Служебная:Поиск_ru.wikipedia.org_all-access_al... 1.037643e+08
103349 Служебная:Поиск_ru.wikipedia.org_desktop_all-a... 9.866417e+07
100414 Служебная:Ссылки_сюда_ru.wikipedia.org_all-acc... 2.510200e+07
104195 Служебная:Ссылки_сюда_ru.wikipedia.org_desktop... 2.505816e+07
97670 Special:Search_ru.wikipedia.org_all-access_all... 2.437457e+07
101457 Special:Search_ru.wikipedia.org_desktop_all-ag... 2.195847e+07
98301 Служебная:Вход_ru.wikipedia.org_all-access_all... 1.216259e+07
es
Page total
92205 Wikipedia:Portada_es.wikipedia.org_all-access_... 751492304.0
95855 Wikipedia:Portada_es.wikipedia.org_mobile-web_... 565077372.0
90810 Especial:Buscar_es.wikipedia.org_all-access_al... 194491245.0
71199 Wikipedia:Portada_es.wikipedia.org_desktop_all... 165439354.0
69939 Especial:Buscar_es.wikipedia.org_desktop_all-a... 160431271.0
94389 Especial:Buscar_es.wikipedia.org_mobile-web_al... 34059966.0
90813 Especial:Entrar_es.wikipedia.org_all-access_al... 33983359.0
143440 Wikipedia:Portada_es.wikipedia.org_all-access_... 31615409.0
93094 Lali_Espósito_es.wikipedia.org_all-access_all-... 26602688.0
69942 Especial:Entrar_es.wikipedia.org_desktop_all-a... 25747141.0
image.png
image.png
后面的进行省略