pandas学习笔记(1)
import pandas as pd
基础
-
创建
image.png
pd.Series([30, 35, 40], index=['2015 Sales', '2016 Sales', '2017 Sales'], name='Product A')
-
读取
- wine_reviews = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv")
有很多参数 可以有index_col=0
wine_reviews.shape
wine_reviews.head()
- 加载xls文件中的一张表
wic = pd.read_excel("../input/publicassistance/xls_files_all/WICAgencies2013ytd.xls", sheet_name='Total Women')
*SQL
import sqlite3
conn = sqlite3.connect("../input/188-million-us-wildfires/FPA_FOD_20170508.sqlite")
fires = pd.read_sql_query("SELECT * FROM fires", conn)
- 写
- wine_reviews.head().to_csv("wine_reviews.csv")
- wic.to_excel('wic.xlsx', sheet_name='Total Women')
- conn = sqlite3.connect("fires.sqlite")
fires.head(10).to_sql("fires", conn)
- 其他,更多见简单手册 Cheat Sheet
删除
排序
NA空白填充 s.add(s3, fill_value=0)
pd.set_option("display.max_rows", 5)
选择
- 基于index (row-first, column-second)
reviews.iloc[0] 获取row 行
reviews.iloc[:, 0] 获取列column
综合使用 reviews.iloc[:3, -5:]
- 基于标签 label-based(row-first, column-second)
reviews.loc[:, ['taster_name', 'taster_twitter_handle', 'points']]
loc 0:10,返回11个,0-10而不是0-9
重设置index
reviews.set_index("title")
reviews.loc[(reviews.country == 'Italy') & (reviews.points >= 90)]
reviews.loc[reviews.country.isin(['Italy', 'France'])]
reviews.loc[reviews.price.notnull()] # isnull()
range(start, stop[, step])
获取统计信息
.describe()
reviews.taster_name.unique()
reviews.points.mean()
reviews.taster_name.value_counts() # 对于字符串,计算不同种类的出现次数
map apply
对于series
review_points_mean = reviews.points.mean()
reviews.points.map(lambda p: p - review_points_mean)
对于dataframe
def remean_points(srs):
srs.points = srs.points - review_points_mean
return srs
reviews.apply(remean_points, axis='columns')
多减一(默认执行map/broadcast)
review_points_mean = reviews.points.mean()
reviews.points - review_points_mean
字符串
reviews.country + " - " + reviews.region_1
df.columns = [col.replace(' ', '_').lower() for col in df.columns]
reviews.groupby('points').points.count() 等价于 reviews['points'].value_counts() (挺有用的)
reviews.groupby('points').price.min()
reviews.groupby(['country', 'province']).apply(lambda df: df.loc[df.points.argmax()]) # 各产地points最大的商品
countries_reviewed.reset_index()
后加内容
相关矩阵
correlation_dataframe.corr()
分箱
image.png