Pandas: Comparison with SQL
这是今年年初在学习 Pandas 时,所写的一篇笔记,当时对 Pandas 还不够熟悉,便借助与 SQL (以 SQLite 为例)进行对比, 理解并熟悉 Pandas 的使用。
# 关于 Pandas 可参看之前的文章——Pandas
0 Prepping
import pandas as pd
import numpy as np
tips = pd.read_csv(r'../data/tips.csv')
tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill 244 non-null float64
tip 244 non-null float64
sex 244 non-null object
smoker 244 non-null object
day 244 non-null object
time 244 non-null object
size 244 non-null int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.4+ KB
tips.head()
import sqlite3
path = '../data/data.db'
# 保存 DataFrame 到 SQLite
def save_sql(df, tablename):
'''
Save Dataframe into SQLite
'''
conn = sqlite3.connect(path)
try:
df.to_sql(tablename, conn, index=False, if_exists='replace')
print('Save Successful.')
except Exception as e:
print('Error:\n\n{0}'.format(e))
conn.close()
# 连接 SQLite 并进行查询操作
def query_sql(query):
'''
Connect SQLite and Query
'''
conn = sqlite3.connect(path)
try:
result = pd.read_sql(query, conn)
print('\nQuery Results:')
return result
except Exception as e:
print('Error:\n\n{0}'.format(e))
conn.close()
# 在 SQLite 中执行游标
def cursor_sql(query):
'''
Executing Cursors in SQLite
'''
conn = sqlite3.connect(path)
cursor = conn.cursor()
cursor.execute(query)
cursor.close()
conn.commit()
conn.close()
# 保存 tips 到数据库
save_sql(tips, 'tips')
Save Successful.
1 SELECT
query = '''
SELECT total_bill, tip, smoker, time
FROM tips
LIMIT 5;
'''
query_sql(query)
Query Results:
tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
2 WHERE
query = '''
SELECT *
FROM tips
WHERE time = 'Dinner'
LIMIT 5;
'''
query_sql(query)
Query Results:
tips[tips['time'] == 'Dinner'].head(5)
is_dinner = tips['time'] == 'Dinner'
is_dinner.value_counts()
True 176
False 68
Name: time, dtype: int64
tips[is_dinner].head(5)
query = '''
-- tips of more than $5.00 at Dinner meals
SELECT *
FROM tips
WHERE time = 'Dinner' AND tip > 5.00;
'''
query_sql(query)
Query Results:
# tips of more than $5.00 at Dinner meals
tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
query = '''
-- tips by parties of at least 5 diners OR bill total was more than $45
SELECT *
FROM tips
WHERE size >= 5 OR total_bill > 45;
'''
query_sql(query)
Query Results:
# tips by parties of at least 5 diners OR bill total was more than $45
tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
# Create a DataFrame
frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'],
'col2': ['F', np.NaN, 'G', 'H', 'I']})
frame
save_sql(frame, 'frame')
Save Successful.
query = '''
-- where col2 IS NULL
SELECT *
FROM frame
WHERE col2 IS NULL;
'''
query_sql(query)
Query Results:
# where col2 IS NULL
frame[frame['col2'].isna()]
query = '''
-- where col1 IS NOT NULL
SELECT *
FROM frame
WHERE col1 IS NOT NULL;
'''
query_sql(query)
Query Results:
# where col1 IS NOT NULL
frame[frame['col1'].notna()]
3 GROUP BY
query = '''
SELECT sex
,count(*)
FROM tips
GROUP BY sex;
'''
query_sql(query)
Query Results:
tips.groupby('sex').size()
sex
Female 87
Male 157
dtype: int64
tips.groupby('sex').count()
tips.groupby('sex')['total_bill'].count()
sex
Female 87
Male 157
Name: total_bill, dtype: int64
query = '''
SELECT day
,AVG(tip)
,COUNT(*)
FROM tips
GROUP BY day;
'''
query_sql(query)
Query Results:
tips.groupby('day').agg({'tip': np.mean, 'day': np.size})
query = '''
SELECT smoker
,day
,COUNT(*)
,AVG(tip)
FROM tips
GROUP BY smoker
,day;
'''
query_sql(query)
Query Results:
tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]})
4 JOIN
4.0 Prepping
df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'],
'value': np.random.randn(4)})
df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'],
'value': np.random.randn(4)})
save_sql(df1, 'df1')
Save Successful.
save_sql(df2, 'df2')
Save Successful.
4.1 INNER JOIN
query = '''
SELECT *
FROM df1
INNER JOIN df2
ON df1.key = df2.key;
'''
query_sql(query)
Query Results:
# merge performs an INNER JOIN by default
pd.merge(df1, df2, on='key')
indexed_df2 = df2.set_index('key')
pd.merge(df1, indexed_df2, left_on='key', right_index=True)
4.2 LEFT OUTER JOIN
query = '''
-- show all records from df1
SELECT *
FROM df1
LEFT OUTER JOIN df2
ON df1.key = df2.key;
'''
query_sql(query)
Query Results:
# show all records from df1
pd.merge(df1, df2, on='key', how='left')
4.3 RIGHT OUTER JOIN
# SQLite cannot RIGHT OUTER JOIN
query = '''
-- show all records from df2
SELECT *
FROM df1
RIGHT OUTER JOIN df2
ON df1.key = df2.key;
'''
query_sql(query)
Error:
Execution failed on sql '
-- show all records from df2
SELECT *
FROM df1
RIGHT OUTER JOIN df2
ON df1.key = df2.key;
': RIGHT and FULL OUTER JOINs are not currently supported
# show all records from df2
pd.merge(df1, df2, on='key', how='right')
4.4 FULL JOIN
# SQLite cannot FULL OUTER JOIN
query = '''
-- show all records from both tables
SELECT *
FROM df1
FULL OUTER JOIN df2
ON df1.key = df2.key;
'''
query_sql(query)
Error:
Execution failed on sql '
-- show all records from both tables
SELECT *
FROM df1
FULL OUTER JOIN df2
ON df1.key = df2.key;
': RIGHT and FULL OUTER JOINs are not currently supported
# show all records from both frames
pd.merge(df1, df2, on='key', how='outer')
5 UNION
city1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'],
'rank': range(1, 4)})
city2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'],
'rank': [1, 4, 5]})
save_sql(city1, 'city1')
save_sql(city2, 'city2')
Save Successful.
Save Successful.
query = '''
SELECT city
,rank
FROM city1
UNION ALL
SELECT city
,rank
FROM city2;
'''
query_sql(query)
Query Results:
pd.concat([city1, city2])
query = '''
SELECT city
,rank
FROM city1
UNION
SELECT city
,rank
FROM city2;
'''
query_sql(query)
Query Results:
pd.concat([city1, city2]).drop_duplicates()
6 Pandas equivalents for some SQL analytic and aggregate functions
6.1 Top N rows with offset
-- MySQL
SELECT * FROM tips
ORDER BY tip DESC
LIMIT 10 OFFSET 5;
tips.nlargest(10+5, columns='tip').tail(10)
6.2 Top N rows per group
-- Oracle's ROW_NUMBER() analytic function
SELECT * FROM (
SELECT
t.*,
ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
FROM tips t
)
WHERE rn < 3
ORDER BY day, rn;
(tips.assign(rn=tips.sort_values(['total_bill'], ascending=False)
.groupby(['day'])
.cumcount() + 1)
.query('rn < 3')
.sort_values(['day','rn'])
)
(tips.assign(rnk=tips.groupby(['day'])['total_bill']
.rank(method='first', ascending=False))
.query('rnk < 3')
.sort_values(['day','rnk'])
)
-- Oracle's RANK() analytic function
SELECT * FROM (
SELECT
t.*,
RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
FROM tips t
WHERE tip < 2
)
WHERE rnk < 3
ORDER BY sex, rnk;
(tips[tips['tip'] < 2]
.assign(rnk_min=tips.groupby(['sex'])['tip']
.rank(method='min'))
.query('rnk_min < 3')
.sort_values(['sex','rnk_min'])
)
7 UPDATE
query = '''
UPDATE tips
SET tip = tip*2
WHERE tip < 2;
'''
cursor_sql(query)
tips.loc[tips['tip'] < 2, 'tip'] *= 2
8 DELETE
query = '''
DELETE FROM tips
WHERE tip > 9;
'''
cursor_sql(query)
tips = tips.loc[tips['tip'] <= 9]