Pandas 数据可视化
2019-07-29 本文已影响0人
我是李小胖
基本图形
柱状图
reviews['points'].value_counts().sort_index().plot.bar()
散点图
reviews[reviews['price'] < 100].sample(100).plot.scatter(x='price', y='points')

蜂窝图
reviews[reviews['price'] < 100].plot.hexbin(x='price', y='points', gridsize=15)

大量重复的点可以用这种图表示
柱状图-叠加模式

wine_counts.plot.bar(stacked=True)

面积模式
wine_counts.plot.area()
折线模式
wine_counts.plot.line()
美化
设置图的大小,字体大小,颜色,标题
reviews['points'].value_counts().sort_index().plot.bar(
figsize=(12, 6),
color='mediumvioletred',
fontsize=16,
title='Rankings Given by Wine Magazine',
)
借助Matplotlib
import matplotlib.pyplot as plt
ax = reviews['points'].value_counts().sort_index().plot.bar(
figsize=(12, 6),
color='mediumvioletred',
fontsize=16
)
ax.set_title("Rankings Given by Wine Magazine", fontsize=20)

借助Seaborn-去除边框
import matplotlib.pyplot as plt
import seaborn as sns
ax = reviews['points'].value_counts().sort_index().plot.bar(
figsize=(12, 6),
color='mediumvioletred',
fontsize=16
)
ax.set_title("Rankings Given by Wine Magazine", fontsize=20)
sns.despine(bottom=True, left=True)

多图表
matplotlib
# Create a 2x2 grid of axes; each pandas plot is drawn into one cell via `ax=`.
fig, axarr = plt.subplots(2, 2, figsize=(12, 8))
# Top-left cell: counts of each `points` value, ordered by the value itself.
reviews['points'].value_counts().sort_index().plot.bar(
    ax=axarr[0][0]
)
# Bottom-right cell: the 20 most frequent `province` values.
reviews['province'].value_counts().head(20).plot.bar(
    ax=axarr[1][1]
)

Seaborn
df = footballers[footballers['Position'].isin(['ST', 'GK'])]
g = sns.FacetGrid(df, col="Position", col_wrap=2)
g.map(sns.kdeplot, "Overall")

df = footballers[footballers['Position'].isin(['ST', 'GK'])]
df = df[df['Club'].isin(['Real Madrid CF', 'FC Barcelona', 'Atlético Madrid'])]
g = sns.FacetGrid(df, row="Position", col="Club")
g.map(sns.violinplot, "Overall")

df = footballers[footballers['Position'].isin(['ST', 'GK'])]
df = df[df['Club'].isin(['Real Madrid CF', 'FC Barcelona', 'Atlético Madrid'])]
g = sns.FacetGrid(df, row="Position", col="Club",
row_order=['GK', 'ST'],
col_order=['Atlético Madrid', 'FC Barcelona', 'Real Madrid CF'])
g.map(sns.violinplot, "Overall")
控制显示顺序
pairplot-多变量的相互关系
sns.pairplot(footballers[['Overall', 'Potential', 'Value']])

颜色,图标参数
sns.lmplot(
x='Value', y='Overall',
markers=['o', 'x', '*'],
hue='Position',
data=footballers.loc[footballers['Position'].isin(
['ST', 'RW', 'LW'])],
fit_reg=False
)

分组
# Select rows whose Position is ST or GK, keeping only the columns used below.
f = footballers.loc[
    footballers['Position'].isin(['ST', 'GK']),
    ['Value', 'Overall', 'Aggression', 'Position'],
]
# Keep 80 <= Overall < 85. The explicit .copy() makes the subset an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
f = f[(f['Overall'] >= 80) & (f['Overall'] < 85)].copy()
f['Aggression'] = f['Aggression'].astype(float)
# One box per Overall value, split into one box per Position through `hue`.
sns.boxplot(x="Overall", y="Aggression", hue='Position', data=f)

总结图
热力图
f = (
footballers.loc[:, ['Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control']]
.applymap(lambda v: int(v) if str.isdecimal(v) else np.nan)
.dropna()
).corr()
sns.heatmap(f, annot=True)

平行线图
from pandas.plotting import parallel_coordinates
f = (
footballers.iloc[:, 12:17]
.loc[footballers['Position'].isin(['ST', 'GK'])]
.applymap(lambda v: int(v) if str.isdecimal(v) else np.nan)
.dropna()
)
f['Position'] = footballers['Position']
f = f.sample(200)
parallel_coordinates(f, 'Position')

Seaborn使用
基本图形
柱状图-值统计
countplot 的效果相当于先做 value_counts 再画柱状图
# Pass the series as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, while `x=` is accepted by old and new versions.
sns.countplot(x=reviews['points'])

折线图-密度图
sns.kdeplot(reviews.query('price < 200').price)

二维密度图--类似蜂窝图作用
样本多,重复点多的时候用
# Bivariate density of price (x) against points (y) on a 5000-row sample of the
# rows with price < 200.
# The two variables are passed explicitly as x/y: seaborn >= 0.11 interprets a
# two-column DataFrame as wide-form data and would draw two separate
# univariate curves instead of one 2-D density.
price_points = (
    reviews.loc[reviews['price'] < 200, ['price', 'points']]
    .dropna()
    .sample(5000)
)
sns.kdeplot(x=price_points['price'], y=price_points['points'])

直方图
类似pandas.hist
# `distplot` is deprecated in seaborn >= 0.11; `histplot` is its replacement
# for a plain count histogram (no KDE curve overlaid).
sns.histplot(reviews['points'], bins=10, kde=False)

散点图和直方图复合
sns.jointplot(x='price', y='points', data=reviews[reviews['price'] < 100])

蜂窝图和直方图复合
sns.jointplot(x='price', y='points', data=reviews[reviews['price'] < 100], kind='hex',gridsize=20)

箱线图
df = reviews[reviews.variety.isin(reviews.variety.value_counts().head(5).index)]
sns.boxplot(
x='variety',
y='points',
data=df
)

小提琴图
sns.violinplot(
x='variety',
y='points',
data=reviews[reviews.variety.isin(reviews.variety.value_counts()[:5].index)]
)

网络动态图表-plotly
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
散点图
import plotly.graph_objs as go
iplot([go.Scatter(x=reviews.head(1000)['points'], y=reviews.head(1000)['price'], mode='markers')])

热力图
# Overlay a 2-D histogram contour (first 500 rows) with a scatter trace
# (first 1000 rows) in a single figure.
iplot([
    go.Histogram2dContour(
        x=reviews.head(500)['points'],
        y=reviews.head(500)['price'],
        # A plain dict replaces the deprecated top-level `go.Contours` class
        # and is accepted by both old and new plotly releases.
        contours=dict(coloring='heatmap'),
    ),
    go.Scatter(
        x=reviews.head(1000)['points'],
        y=reviews.head(1000)['price'],
        mode='markers',
    ),
])

图形语法的可视化库plotnine
from plotnine import *
top_wines = reviews[reviews['variety'].isin(reviews['variety'].value_counts().head(5).index)]
df = top_wines.head(1000).dropna()
# Base form: data, aesthetic mapping and geometry are added as separate layers.
(
    ggplot(df)
    + aes('points', 'price')
    + geom_point()
)
# Alternative form: the mapping is passed to the geometry layer.
(
    ggplot(df)
    + geom_point(aes('points', 'price'))
)
# Alternative form: the mapping is passed to ggplot() together with the data.
(
    ggplot(df, aes('points', 'price'))
    + geom_point()
)
一层层添加图形参数

df = top_wines.head(1000).dropna()
(
ggplot(df)
+ aes('points', 'price')
+ geom_point()
+ stat_smooth()
)

添加颜色
df = top_wines.head(1000).dropna()
(
ggplot(df)
+ geom_point()
+ aes(color='points')
+ aes('points', 'price')
+ stat_smooth()
)
一图多表
df = top_wines.head(1000).dropna()
(ggplot(df)
+ aes('points', 'price')
+ aes(color='points')
+ geom_point()
+ stat_smooth()
+ facet_wrap('~variety')
)

柱状图
(ggplot(top_wines)
+ aes('points')
+ geom_bar()
)

二维热力图
(ggplot(top_wines)
+ aes('points', 'variety')
+ geom_bin2d(bins=20)
)

更多 API 文档请参考 plotnine 官方的 API Reference。
处理时间序列
一般折线图
shelter_outcomes['date_of_birth'].value_counts().sort_values().plot.line()

按年份重新取样
shelter_outcomes['date_of_birth'].value_counts().resample('Y').sum().plot.line()

stocks['volume'].resample('Y').mean().plot.bar()

同期对比
如今年12月和去年12月比较
from pandas.plotting import lag_plot
lag_plot(stocks['volume'].tail(250))

自相关图
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(stocks['volume'])
