呆鸟的Python数据分析Python 数据科学笔记

Pandas: Comparison with SQL

2018-07-28  本文已影响34人  Gaius_Yao

  这是今年年初在学习 Pandas 时,所写的一篇笔记,当时对 Pandas 还不够熟悉,便借助与 SQL (以 SQLite 为例)进行对比, 理解并熟悉 Pandas 的使用。

# 关于 Pandas 可参看之前的文章——Pandas

0 Prepping

import pandas as pd
import numpy as np
tips = pd.read_csv(r'../data/tips.csv')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill 244 non-null float64
tip 244 non-null float64
sex 244 non-null object
smoker 244 non-null object
day 244 non-null object
time 244 non-null object
size 244 non-null int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.4+ KB

tips.head()
import sqlite3

path = '../data/data.db'

# 保存 DataFrame 到 SQLite
def save_sql(df, tablename):
    '''
         Save Dataframe into SQLite
    '''
    conn = sqlite3.connect(path)
    
    try:
        df.to_sql(tablename, conn, index=False, if_exists='replace')
        print('Save Successful.')
    except Exception as e:
        print('Error:\n\n{0}'.format(e))
        
    conn.close()

#  连接 SQLite 并进行查询操作
def query_sql(query):
    '''
        Connect SQLite and Query
    '''
    conn = sqlite3.connect(path)
    
    try:
        result = pd.read_sql(query, conn)
        print('\nQuery Results:')
        return result
    except Exception as e:
        print('Error:\n\n{0}'.format(e))
    
    conn.close()

# 在 SQLite 中执行游标    
def cursor_sql(query):
    '''
         Executing Cursors in SQLite
    '''
    conn = sqlite3.connect(path)
    
    cursor = conn.cursor()
    cursor.execute(query)
    cursor.close()
    
    conn.commit()
    conn.close()
# 保存 tips 到数据库
save_sql(tips, 'tips')

Save Successful.

1 SELECT

query = '''
SELECT total_bill, tip, smoker, time
FROM tips
LIMIT 5;
'''

query_sql(query)

Query Results:

tips[['total_bill', 'tip', 'smoker', 'time']].head(5)

2 WHERE

query = '''
SELECT *
FROM tips
WHERE time = 'Dinner'
LIMIT 5;
'''

query_sql(query)

Query Results:

tips[tips['time'] == 'Dinner'].head(5)
is_dinner = tips['time'] == 'Dinner'
is_dinner.value_counts()

True 176
False 68
Name: time, dtype: int64

tips[is_dinner].head(5)
query = '''
-- tips of more than $5.00 at Dinner meals
SELECT *
FROM tips
WHERE time = 'Dinner' AND tip > 5.00;
'''

query_sql(query)

Query Results:

# tips of more than $5.00 at Dinner meals
tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)] 
query = '''
-- tips by parties of at least 5 diners OR bill total was more than $45
SELECT *
FROM tips
WHERE size >= 5 OR total_bill > 45;
'''

query_sql(query)

Query Results:

# tips by parties of at least 5 diners OR bill total was more than $45
tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
# Create a DataFrame
frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'],
                      'col2': ['F', np.NaN, 'G', 'H', 'I']})
frame
save_sql(frame, 'frame')
Save Successful.
query = '''
-- where col2 IS NULL
SELECT *
FROM frame
WHERE col2 IS NULL;
'''

query_sql(query)

Query Results:

# where col2 IS NULL
frame[frame['col2'].isna()]
query = '''
-- where col1 IS NOT NULL
SELECT *
FROM frame
WHERE col1 IS NOT NULL;
'''

query_sql(query)

Query Results:

# where col1 IS NOT NULL
frame[frame['col1'].notna()]

3 GROUP BY

query = '''
SELECT sex
    ,count(*)
FROM tips
GROUP BY sex;
'''

query_sql(query)

Query Results:

tips.groupby('sex').size()

sex
Female 87
Male 157
dtype: int64

tips.groupby('sex').count()
tips.groupby('sex')['total_bill'].count()

sex
Female 87
Male 157
Name: total_bill, dtype: int64

query = '''
SELECT day
    ,AVG(tip)
    ,COUNT(*)
FROM tips
GROUP BY day;
'''

query_sql(query)

Query Results:

tips.groupby('day').agg({'tip': np.mean, 'day': np.size})
query = '''
SELECT smoker
    ,day
    ,COUNT(*)
    ,AVG(tip)
FROM tips
GROUP BY smoker
    ,day;
'''

query_sql(query)

Query Results:

tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]})

4 JOIN

4.0 Prepping

df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'],
                    'value': np.random.randn(4)})
df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'],
                    'value': np.random.randn(4)})
save_sql(df1, 'df1')

Save Successful.

save_sql(df2, 'df2')

Save Successful.

4.1 INNER JOIN

query = '''
SELECT *
FROM df1
INNER JOIN df2
    ON df1.key = df2.key;
'''

query_sql(query)

Query Results:

# merge performs an INNER JOIN by default
pd.merge(df1, df2, on='key')
indexed_df2 = df2.set_index('key')
pd.merge(df1, indexed_df2, left_on='key', right_index=True)

4.2 LEFT OUTER JOIN

query = '''
-- show all records from df1
SELECT *
FROM df1
LEFT OUTER JOIN df2
    ON df1.key = df2.key;
'''

query_sql(query)

Query Results:

# show all records from df1
pd.merge(df1, df2, on='key', how='left')

4.3 RIGHT OUTER JOIN

# SQLite cannot RIGHT OUTER JOIN
query = '''
-- show all records from df2
SELECT *
FROM df1
RIGHT OUTER JOIN df2
    ON df1.key = df2.key;
'''

query_sql(query)

Error:

Execution failed on sql '
-- show all records from df2
SELECT *
FROM df1
RIGHT OUTER JOIN df2
ON df1.key = df2.key;
': RIGHT and FULL OUTER JOINs are not currently supported

# show all records from df2
pd.merge(df1, df2, on='key', how='right')

4.4 FULL JOIN

# SQLite cannot FULL OUTER JOIN
query = '''
-- show all records from both tables
SELECT *
FROM df1
FULL OUTER JOIN df2
  ON df1.key = df2.key;
'''

query_sql(query)

Error:

Execution failed on sql '
-- show all records from both tables
SELECT *
FROM df1
FULL OUTER JOIN df2
ON df1.key = df2.key;
': RIGHT and FULL OUTER JOINs are not currently supported

# show all records from both frames
pd.merge(df1, df2, on='key', how='outer')

5 UNION

city1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'],
                    'rank': range(1, 4)})
city2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'],
                    'rank': [1, 4, 5]})
save_sql(city1, 'city1')
save_sql(city2, 'city2')

Save Successful.
Save Successful.

query = '''
SELECT city
    ,rank
FROM city1
UNION ALL
SELECT city
    ,rank
FROM city2;
'''

query_sql(query)

Query Results:

pd.concat([city1, city2])
query = '''
SELECT city
    ,rank
FROM city1
UNION
SELECT city
    ,rank
FROM city2;
'''

query_sql(query)

Query Results:

pd.concat([city1, city2]).drop_duplicates()

6 Pandas equivalents for some SQL analytic and aggregate functions

6.1 Top N rows with offset

-- MySQL
SELECT * FROM tips
ORDER BY tip DESC
LIMIT 10 OFFSET 5;
tips.nlargest(10+5, columns='tip').tail(10)

6.2 Top N rows per group

-- Oracle's ROW_NUMBER() analytic function
SELECT * FROM (
  SELECT
    t.*,
    ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
  FROM tips t
)
WHERE rn < 3
ORDER BY day, rn;
(tips.assign(rn=tips.sort_values(['total_bill'], ascending=False)
                         .groupby(['day'])
                         .cumcount() + 1)
          .query('rn < 3')
          .sort_values(['day','rn'])
    )
 (tips.assign(rnk=tips.groupby(['day'])['total_bill']
                          .rank(method='first', ascending=False))
          .query('rnk < 3')
          .sort_values(['day','rnk'])
     )
-- Oracle's RANK() analytic function
SELECT * FROM (
  SELECT
    t.*,
    RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
  FROM tips t
  WHERE tip < 2
)
WHERE rnk < 3
ORDER BY sex, rnk;
(tips[tips['tip'] < 2]
          .assign(rnk_min=tips.groupby(['sex'])['tip']
                              .rank(method='min'))
          .query('rnk_min < 3')
          .sort_values(['sex','rnk_min'])
     )

7 UPDATE

query = '''
UPDATE tips
SET tip = tip*2
WHERE tip < 2;
'''

cursor_sql(query)
tips.loc[tips['tip'] < 2, 'tip'] *= 2

8 DELETE

query = '''
DELETE FROM tips
WHERE tip > 9;
'''

cursor_sql(query)
tips = tips.loc[tips['tip'] <= 9]
上一篇下一篇

猜你喜欢

热点阅读