5.pandas进阶(SQL对照,不定时更新)
2018-07-10 本文已影响85人
B0ZZ
本章开始,进入实战
0 实验数据
环境:mysql+python 3.6
表名:EMP
CREATE TABLE if not exists EMP(empno int,ename VARCHAR(10),job VARCHAR(9),mgr int,hiredate varchar(10),sal double,comm double,deptno int);
INSERT INTO EMP VALUES (7369, 'SMITH', 'CLERK', 7902,'1980-12-17', 800, NULL, 20);
INSERT INTO EMP VALUES (7499, 'ALLEN', 'SALESMAN', 7698,'1981-02-10', 1600, 300, 30);
INSERT INTO EMP VALUES (7521, 'WARD', 'SALESMAN', 7698,'1981-02-22', 1250, 500, 30);
INSERT INTO EMP VALUES (7566, 'JONES', 'MANAGER', 7839,'1981-04-02', 2975, NULL, 20);
INSERT INTO EMP VALUES (7654, 'MARTIN', 'SALESMAN', 7698,'1981-09-28', 1250, 1400, 30);
INSERT INTO EMP VALUES (7698, 'BLAKE', 'MANAGER', 7839,'1981-05-01', 2850, NULL, 30);
INSERT INTO EMP VALUES (7782, 'CLARK', 'MANAGER', 7839,'1981-07-09', 2450, NULL, 10);
INSERT INTO EMP VALUES (7788, 'SCOTT', 'ANALYST', 7566,'1982-12-09', 3000, NULL, 20);
INSERT INTO EMP VALUES (7839, 'KING', 'PRESIDENT', NULL,'1981-11-17', 5000, NULL, 10);
INSERT INTO EMP VALUES (7844, 'TURNER', 'SALESMAN', 7698,'1981-09-08', 1500, 0, 30);
INSERT INTO EMP VALUES (7876, 'ADAMS', 'CLERK', 7788,'1983-01-12', 1100, NULL, 20);
INSERT INTO EMP VALUES (7900, 'JAMES', 'CLERK', 7698,'1981-12-03', 950, NULL, 30);
INSERT INTO EMP VALUES (7902, 'FORD', 'ANALYST', 7566,'1981-12-03', 3000, NULL, 20);
INSERT INTO EMP VALUES (7934, 'MILLER', 'CLERK', 7782,'1982-11-23', 1300, NULL, 10);
1 SQL和pandas对照
#连接mysql数据库
from sqlalchemy import create_engine
ce="mysql+mysqldb://password@ip:port/dbname?charset=utf8"
from_conn = create_engine(ce, echo=True, encoding='utf-8')
df = pd.read_sql_query("select * from emp",con=from_conn)
#--emp表的sal*10
#select a.sal*10 from emp a;
df["sal"] = df["sal"] * df["sal"] / df["sal"] * 10
#--emp表各部门的sal的平均值,总和,max和min(遇到缺失数值时,需特殊处理)
#select a.deptno,avg(a.sal) as avg_sal,sum(a.sal) as sum_sal,max(a.sal) as max_sal,min(a.sal) as min_sal from emp a group by a.deptno;
grouped = df.groupby(["deptno"])
grouped["sal"].agg([np.mean, np.sum, np.max, np.min]).reset_index()
#--emp表中sal<1000的信息
#select * from emp where sal<1000;
df = df[df["sal"]<1000]
#--ename中含有A的信息
#select ename from emp where ename like'%A%';
df = df.ix[df["ename"].str.contains("A"),"ename"]
#--emp中ename不含有A的信息
#select * from emp where ename not like'%A%';
df = df[~df["ename"].str.contains("A")]
#--计算各部门中薪资小于5000的员工的工资总和
#select deptno,sum(sal) from emp where sal<5000 group by deptno;
df = df[df["sal"]<5000].groupby(["deptno"]).agg({"sal":np.sum}).reset_index()
#--计算各部门中工资不高于其部门平均工资的工资总和
#select a.deptno,sum(a.sal) from emp as a join (select c.deptno, avg(c.sal) as avg_sal from emp as c group by c.deptno) as b where a.deptno=b.deptno and a.sal<b.avg_sal group by a.deptno;
dfg = df.groupby(["deptno"]).agg({"sal":np.mean}).reset_index()
dfg = dfg.rename(columns={"sal":"avg_sal"})
df = pd.merge(left=df,right=dfg,on="deptno", how="inner")
df = df[df["sal"]<df["avg_sal"]].groupby(["deptno"]).agg({"sal":np.sum})
#--计算各部门中工资小于4000的员工的平均工资
#select a.deptno, avg(a.sal) from (select b.deptno, b.sal from emp as b where b.sal < 4000) as a group by a.deptno;
df = df[df["sal"]<4000].groupby(["deptno"]).agg({"sal":np.mean}).reset_index()
#--查询薪水低于10000的员工姓名和sal
#select ename,sal from emp where sal<10000;
df = df.ix[df["sal"]<10000,["ename","sal"]]
#--查询工资大于1000,并且是20号部门的员工的全部信息
#select * from emp where sal>1000 and deptno=20;
df = df[df["sal"]>1000 & df["deptno"]==20]
#--根据用户名"SMITH"查找他所在的部门
#select deptno from emp where ename='SMITH';
df = df.ix[df["ename"].isin(["SMITH"]),"deptno"]
#--查询每个部门的最高工资的人员名称
#select e.ename,e.deptno,e.sal from (select deptno as did ,max(sal) as m from emp group by deptno) s,emp e where e.sal=s.m and s.did=e.deptno;
df_max = df.groupby(["deptno"]).agg({"sal":np.max}).reset_index()
df_max = df_max.rename(columns={"sal":"sal_max"})
df = pd.merge(left=df, right=df_max, on="deptno", how="inner")
df = df.ix[df["sal"]==df["sal_max"],["ename","deptno","sal"]]
#--查询“Jones”之后第一个进入公司的人
#select * from emp where hiredate=(select min(hiredate) from emp where hiredate>(select hiredate from emp where ename='JONES')) ;
df = df.sort_values(by=["ename","hiredate"])
df = df.groupby(["ename"]).head(1).reset_index()
hiredate = df.ix[df["ename"].isin(["JONES"]),"hiredate"].tolist()[0]
df = df[df["hiredate"]>hiredate ]
df = df.sort_values(by=["hiredate"])
df = df.head(1)
#--查找工资最高的部门名称和工资最低的部门名称及工资
#select d.dname,e.sal from emp e,dept d where e.deptno=d.deptno and sal=(select max(m) from (select deptno,max(sal) as m from emp e group by deptno) s) union select d.dname,e.sal from emp e,dept d where e.deptno=d.deptno and sal=(select min(m) from (select deptno,min(sal) as m from emp e group by deptno) s);
df = df.sort_values(by=["sal"],)
df = pd.concat([df.head(1),df.tail(1)])
df = df[["deptno","sal"]]