pandas SQL usage notes
2019-12-21
叫我老村长
When I first started with pandas, pandasql felt more convenient. Now that I've used the native API more, the native way feels more flexible.
# Imports
import pandas as pd
import numpy as np
import pymysql
import os  # used by the CSV examples below
# Creating a dataset
df = pd.DataFrame(datalist)  # datalist: a dict or list of dicts
# SQL
connection = pymysql.connect(host='192.168.1.1', user='root', password='root',
                             db='db1', charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
# note: a bare "global connection" is only meaningful inside a function body
sql = "SELECT * FROM server"
df = pd.read_sql(sql, con=connection)
df = pd.read_sql(sql, con=connection, index_col='id')
df = df.fillna("无")
# Saving to a table
# pd.io.sql.write_sql(df, 'tablename', connection)  # no longer available
# pd.io.sql.to_sql(df, "tablename", conn, flavor='mysql', if_exists='append')  # flavor was removed
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/lib?charset=utf8')
df.to_sql("tablename", engine, if_exists='append',index=False)
# If that is too slow, try this (note: copy_from is a psycopg2/PostgreSQL
# cursor method; it does not exist on pymysql cursors)
connection = engine.raw_connection()  # engine comes from sqlalchemy.create_engine above
cursor = connection.cursor()
# NULL values become ''
cursor.copy_from(output, table_name, null='')  # output: a file-like CSV buffer
connection.commit()
cursor.close()
# Convert a result to a list
df['col'].tolist()
# Convert a result to a dict
# tmp = df.T.to_dict()
# tmp = tmp.values()
# tmp = list(tmp)
# return tmp
# tmp = df_all.T.to_dict()
# tmp_list = sorted(tmp.items(), key=lambda d: d[0])
# atmp = []
# for x in tmp_list:
#     atmp.append(x[1])
# return atmp
user_dict = {12: {'Category 1': {'att_1': 1, 'att_2': 'whatever'},
                  'Category 2': {'att_1': 23, 'att_2': 'another'}},
             15: {'Category 1': {'att_1': 10, 'att_2': 'foo'},
                  'Category 2': {'att_1': 30, 'att_2': 'bar'}}}
pd.DataFrame.from_dict({(i, j): user_dict[i][j]
                        for i in user_dict.keys()
                        for j in user_dict[i].keys()},
                       orient='index')
'''
               att_1     att_2
12 Category 1      1  whatever
   Category 2     23   another
15 Category 1     10       foo
   Category 2     30       bar
'''
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_dict.html
df.to_dict(orient='records')  # list of row dicts
df.to_dict(orient='split')    # column names + index + data
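# Quick sketch of what the two orients return (tiny made-up frame):
demo = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
demo.to_dict(orient='records')  # [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
demo.to_dict(orient='split')    # {'index': [0, 1], 'columns': ['a', 'b'], 'data': [[1, 'x'], [2, 'y']]}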
# (inside a helper function)
with connection.cursor() as cursor:
    try:
        cursor.execute(sql)
        row = cursor.fetchone()
        rows = cursor.fetchall()
    except Exception as e:
        print(e, sql)
connection.commit()
return row
# NumPy
dates = pd.date_range('20170101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
# Excel
df = pd.read_excel(path + '/data.xlsx', sheet_name=0, index_col=None, na_values=['NA'])
# df = pd.read_excel(filefullpath, sheet_name=[0, 2], skiprows=[0])
# sheet_name selects which sheets to read; sheet numbering starts at 0
# sheet_name=[0, 2] reads sheet 0 and sheet 2
# skiprows=[0] skips row 0 when reading; omit it to keep the header row
# df = pd.read_excel(filefullpath, sheet_name=None, skiprows=[0])
print(df)
print(type(df))
# With multiple sheets, type(df) is <class 'dict'>, i.e. {0: dataframe, 1: dataframe, 2: dataframe}
# With a single sheet, type(df) is <class 'pandas.core.frame.DataFrame'>
# csv
pwd = os.getcwd()
os.chdir(os.path.dirname(trainFile))
trainData = pd.read_csv(os.path.basename(trainFile))
os.chdir(pwd)
df = pd.read_csv(path+'/data.csv', index_col=None, na_values=['NA'], encoding='utf-8')
# Handling very large files
reader = pd.read_csv(path + '/data.csv', index_col=None, na_values=['NA'], encoding='utf-8', iterator=True)
chunk = reader.get_chunk(5)  # peek at the first five rows
chunk.info(memory_usage='deep')  # overall memory info
# Or process in batches
# usecols picks columns; dtype casts types; converting object columns to Category shrinks memory
reader = pd.read_csv(path + '/data.csv', index_col=None, na_values=['NA'], encoding='utf-8', iterator=True)
chunkSize = 100000
loop = True
chunks = []
while loop:
    try:
        chunk = reader.get_chunk(chunkSize)
        chunks.append(chunk)
    except StopIteration:
        loop = False
        print("Iteration is stopped.")
df = pd.concat(chunks, ignore_index=True)
print(df)
# .dat / delimited text files
df = pd.read_table(path+'/data.dat', index_col=None, na_values=['NA'], encoding='utf-8')
# Writing CSV
df.to_csv(path+'/data.csv', encoding='utf-8', index=False)
# http://pandas.pydata.org/pandas-docs/stable/io.html
df.to_html()
'''
pd.read_csv(filename): import data from a CSV file
pd.read_table(filename): import data from a delimited text file
pd.read_excel(filename): import data from an Excel file
pd.read_sql(query, connection_object): import data from a SQL table/database
pd.read_json(json_string): import data from a JSON-formatted string
pd.read_html(url): parse a URL, string, or HTML file and extract its tables
pd.read_clipboard(): grab the clipboard contents and pass them to read_table()
pd.DataFrame(dict): import data from a dict; keys are column names, values the data
df.to_csv(filename): export data to a CSV file
df.to_excel(filename): export data to an Excel file
df.to_sql(table_name, connection_object): export data to a SQL table
df.to_json(filename): export data to a text file in JSON format
'''
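# A minimal round trip over the pairs above (the file name 'demo.csv' is made up):
demo = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
demo.to_csv('demo.csv', index=False)
back = pd.read_csv('demo.csv')
same = pd.read_json(demo.to_json())  # JSON string round trip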
'''
pd.DataFrame(np.random.rand(20,5)): create a DataFrame of random numbers, 20 rows by 5 columns
pd.Series(my_list): create a Series from the iterable my_list
df.index = pd.date_range('1900/1/30', periods=df.shape[0]): add a date index
'''
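# The three recipes above combined (a sketch):
demo = pd.DataFrame(np.random.rand(20, 5))
demo.index = pd.date_range('1900/1/30', periods=demo.shape[0])
s = pd.Series([3, 1, 2])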
# Reverse the row order
df[::-1].reset_index(drop=True)
# Inspecting a dataset
df.dtypes  # column dtypes
df_hour['count'] = df_hour['count'].astype(float)  # change a column's dtype
df.head(); df.tail()
'''
df.head(n): view the first n rows of a DataFrame
df.tail(n): view the last n rows of a DataFrame
df.shape: view the number of rows and columns
df.info(): view the index, dtypes, and memory usage
df.describe(): summary statistics for the numeric columns
s.value_counts(dropna=False): unique values and counts for a Series
df.apply(pd.Series.value_counts): unique values and counts for every column of a DataFrame
'''
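# Inspection sketch on a toy frame:
demo = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'y', 'y']})
demo.head(2)
demo.shape                           # (3, 2)
demo['b'].value_counts(dropna=False)
demo.apply(pd.Series.value_counts)   # per-column value counts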
df.index  # inspect; rebuild with: df.index = range(1, len(df) + 1)
# Sorting
df = df.sort_values('count', ascending=False)
df = df.sort_values(by=['ip','origservice'])
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html?highlight=sort_values#pandas.DataFrame.sort_values
# Column subset
df = df[['title', 'filename', 'from', 'to']]
# Selecting data
df['column']
# Slicing
df[1:3]  # or df['indexstart':'indexend']
# Subsetting: loc selects by label, iloc selects by position
df.loc['indexstart':'indexend', ['c1', 'c2']]
# Extract a single value
df.loc['indexs', 'c1']
# First row, as a Series
df.iloc[0]
# Subset
df.iloc[1:2, 3:5]
df.iloc[[1, 3, 5], :]
# Fast scalar access by position
df.iat[1, 1]
# Getting values from a column
df.loc[idx, 'col']  # idx: some row label
df['col'].iloc[0]
'''
df[col]: select a column by name, returned as a Series
df[[col1, col2]]: select multiple columns, returned as a DataFrame
s.iloc[0]: select by position
s.loc['index_one']: select by index label
df.iloc[0,:]: return the first row
df.iloc[0,0]: return the first element of the first column
'''
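# loc vs iloc on a labeled toy frame (a sketch):
demo = pd.DataFrame({'c1': [10, 20], 'c2': [30, 40]}, index=['r1', 'r2'])
demo.loc['r1', 'c2']  # 30, by label
demo.iloc[0, 1]       # 30, by position
demo.iloc[0]          # first row as a Series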
# Filtering
# http://pandas.pydata.org/pandas-docs/stable/comparison_with_sql.html
SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;
tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
SELECT * FROM tips WHERE time = 'Dinner' LIMIT 5;
is_dinner = tips['time'] == 'Dinner'
is_dinner.value_counts()
tips[is_dinner].head(5)
tips.nlargest(3, 'tip')  # the three rows with the largest tip
SELECT * FROM tips WHERE time = 'Dinner' AND tip > 5.00;
tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
SELECT * FROM tips WHERE size >= 5 OR total_bill > 45;
tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
SELECT * FROM tips WHERE size in (1,2,3,4,5);
inlist = [1,2,3,4,5]
tips['size'].isin(inlist)
tips = tips.loc[(tips['size'] == 1) | (tips['size'] == 2)]
~tips['size'].isin(inlist) # not in
tips[~tips['size'].isin(inlist)]
tips.dropna(how='all')
SELECT * FROM frame WHERE col2 IS NULL;
frame[frame['col2'].isnull()]
SELECT * FROM frame WHERE col1 IS NOT NULL;
frame[frame['col1'].notnull()]
df_downfile = self.df[~self.df['filename'].isnull() & ((self.df['trace_t']!='downfile_http') | (self.df['trace_t']=='web_url'))]
SELECT sex, count(*) FROM tips GROUP BY sex;
tips.groupby('sex').size()
# count per group, with the result returned as a named column
tips.groupby('sex')['total_bill'].count().reset_index(name='counts')
df.groupby(['to'])['to'].transform('count')  # attach each group's size to every row
SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day;
tips.groupby('day').agg({'tip': np.mean, 'day': np.size})
# Several functions can be applied at once. For example, to see how the tip amount differs by day of week, agg() lets you pass the grouped DataFrame a dict saying which functions apply to which columns.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html?highlight=agg#pandas.core.groupby.DataFrameGroupBy.agg
SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day;
tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]}) #np.std https://docs.scipy.org/doc/numpy/reference/routines.statistics.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.aggregate.html
df.agg("mean", axis="columns")
'''
0    2.0
1    5.0
2    8.0
3    NaN
'''
# Fuzzy filtering on a DataFrame (like SQL's LIKE):
# https://pandas.pydata.org/pandas-docs/stable/api.html#string-handling
df[df['plan'].str.contains(r'.*?user.*')]  # regex match; * = 0 or more times, ? = 0 or 1 time
# Transforming data with a DataFrame (more notes to come)
df['plan'] = df['plan'].str.replace('user(.{2,})info', '\\1', regex=True)  # regex supported
# apply, in detail
# http://stackoverflow.com/questions/17841149/pandas-groupby-how-to-get-a-union-of-strings/17841294
# https://my.oschina.net/lionets/blog/280332
# Collapse rows with the same key into one collection
df_o = df.groupby('name')['a'].apply(lambda x: "{%s}" % ', '.join(x.astype(str)))
df_a = df.groupby('name')['a'].apply(list)
df_b = df.groupby('name')['a'].unique().reset_index()  # recommended
# Binning
pd.cut(df['age'], bins=[0, 18, 35, 60, 90], labels=[...])
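# A concrete cut, with made-up labels:
df['age_band'] = pd.cut(df['age'], bins=[0, 18, 35, 60, 90],
                        labels=['child', 'young', 'middle', 'senior'])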
SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;
pd.merge(df1, df2, on='key')
df_crc = pd.merge(df_crc, df_crc_list, how='left', left_on="app_crc", right_on="crc") # how='outer'
SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2;
pd.concat([df1, df2], ignore_index=True)  # a fresh, duplicate-free index is needed before doing sum
'''
df1.append(df2): append the rows of df2 to the end of df1
pd.concat([df1, df2], axis=1): append the columns of df2 to df1
df1.join(df2,on=col1,how='inner'): SQL-style join of df1's columns with df2's columns
'''
SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5;
tips.nlargest(10 + 5, columns='tip').tail(10)
tips.sort_values('tip', ascending=False).head(10 + 5).tail(10)
# Example (writing it this way is clumsy, but it works)
SELECT * FROM ( SELECT t.*, ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn FROM tips t ) WHERE rn < 3 ORDER BY day, rn;
(tips.assign(rn=tips.sort_values(['total_bill'], ascending=False).groupby(['day']).cumcount() + 1).query('rn < 3').sort_values(['day','rn']))
(tips.assign(rnk=tips.groupby(['day'])['total_bill'].rank(method='first', ascending=False)).query('rnk < 3').sort_values(['day','rnk']))
UPDATE tips SET tip = tip*2 WHERE tip < 2;
tips.loc[tips['tip'] < 2, 'tip'] *= 2
DELETE FROM tips WHERE tip > 9;
tips = tips.loc[tips['tip'] <= 9]
# Inserting a column
df['a'] = 1
df.insert(1,'a', np.random.randn(6))
del df['a']
# Deleting data
df = df.drop(['d', 'e'], axis=1)
df = df.drop(df[df['支行'] == '总计'].index)  # axis=0 drops rows
# https://pandas.pydata.org/pandas-docs/stable/api.html#groupby
# use as_index=False (or reset_index) so the group keys become named columns
df.groupby(['to'], as_index=False).sum()
# Counting
df.groupby('to')['to'].value_counts()
df['count'] = df.groupby(['to'])['to'].transform('count')
df['size'] = df.groupby(['A', 'B']).transform(np.size)
# Grouped aggregation
df.groupby(['id', 'blankid']).aggregate(np.sum)
'''
df[df[col] > 0.5]: rows where column col is greater than 0.5
df.sort_values(col1): sort by column col1, ascending by default
df.sort_values(col2, ascending=False): sort by column col2, descending
df.sort_values([col1,col2], ascending=[True,False]): sort by col1 ascending, then col2 descending
df.groupby(col): return a GroupBy object grouped by column col
df.groupby([col1,col2]): return a GroupBy object grouped by multiple columns
df.groupby(col1)[col2]: the values of col2 grouped by col1 (e.g. take their mean)
df.pivot_table(index=col1, values=[col2,col3], aggfunc=max): pivot table grouped by col1, with the max of col2 and col3
df.groupby(col1).agg(np.mean): the mean of every column, grouped by col1
data.apply(np.mean): apply np.mean to every column of a DataFrame
data.apply(np.max,axis=1): apply np.max to every row of a DataFrame
'''
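# groupby / pivot_table sketch on a toy frame:
demo = pd.DataFrame({'k': ['a', 'a', 'b'], 'v1': [1, 2, 3], 'v2': [4, 5, 6]})
demo.groupby('k')['v1'].mean()
demo.pivot_table(index='k', values=['v1', 'v2'], aggfunc=max)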
# Size of each group
df.groupby(col).size()
# Describing
df.describe()
'''
df.describe(): summary statistics for the numeric columns
df.mean(): the mean of every column
df.corr(): the pairwise correlations between columns
df.count(): the number of non-null values in every column
df.max(): the maximum of every column
df.min(): the minimum of every column
df.median(): the median of every column
df.std(): the standard deviation of every column
'''
# DataFrame column to Series
df_keyword = self.df[['keyword']]
df_keyword = df_keyword.squeeze()  # a single value squeezes down to a scalar
if isinstance(df_title, str):
    keywordstr = str(df_title)
else:
    tmp = df_title.tolist()
    keywordstr = ','.join(tmp)
df_keyword.tolist()
# Handling missing values
df.fillna('')
df.fillna(method='pad', limit=1)
# pad fills from the previous value, bfill from the next; limit caps the fill run
df.fillna(df.mean())  # column means
# Interpolation
df.interpolate()
# Interpolate against the index
df.interpolate(method='values')  # or method='time' for a DatetimeIndex
df.dropna()  # axis=1 drops columns
df.dropna(how='all')  # drop a row only if every column is null
# Finding duplicates
df.duplicated()
# Dropping duplicates
df.drop_duplicates(subset=None, keep='first', inplace=False)
'''
subset: column label or sequence of labels, optional
    restrict the duplicate check to particular columns; defaults to all columns
keep: {'first', 'last', False}, default 'first'
    which duplicate to keep ('first' keeps the first occurrence; False drops them all)
inplace: boolean, default False
    modify the data in place, or return a copy
'''
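# drop_duplicates sketch:
demo = pd.DataFrame({'k': ['a', 'a', 'b'], 'v': [1, 1, 2]})
demo.drop_duplicates()                           # exact duplicate rows
demo.drop_duplicates(subset=['k'], keep='last')  # dedupe on k only, keep the last occurrence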
pd.notnull  # tests whether values are non-null
# Replacing values
df = df.replace('', np.nan)
df1 = df.replace(np.nan, ' ', regex=True)  # regex patterns allowed
df.replace(1, 2)  # rarely needed; apply is usually more useful
'''
df.columns = ['a','b','c']: rename the columns
df.columns.values: get the column names
df.dtypes
pd.isnull(): check a DataFrame for null values, returning a Boolean array
pd.notnull(): check a DataFrame for non-null values, returning a Boolean array
df.dropna(): drop every row that contains a null value
df.dropna(axis=1): drop every column that contains a null value
df.dropna(axis=1,thresh=n): drop rows with fewer than n non-null values
df.fillna(x): replace every null value in the DataFrame with x
s.astype(float): cast the data in a Series to float
s.replace(1,'one'): replace every value equal to 1 with 'one'
s.replace([1,3],['one','three']): replace 1 with 'one' and 3 with 'three'
df.rename(columns=lambda x: x + 1): rename columns in bulk
df.rename(columns={'old_name': 'new_name'}): rename selected columns
df.set_index('column_one'): change the index column
Setting a date index directly:
df['date'] = pd.to_datetime(df['date'])
df.set_index("date", inplace=True)
df.add_prefix('x_'): add a prefix to the column names
df.add_suffix('_x'): add a suffix to the column names
df.rename(index=lambda x: x + 1): rename the index in bulk
'''
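# Rename / re-index sketch pulling the lines above together (column names are made up):
demo = pd.DataFrame({'old_name': [1, 2], 'date': ['2019-01-01', '2019-01-02']})
demo = demo.rename(columns={'old_name': 'new_name'})
demo['date'] = pd.to_datetime(demo['date'])
demo = demo.set_index('date').add_prefix('x_')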
df.select_dtypes(include='number')
# Two styles of column operations; creating a new column is the more flexible one
dst_ip['addr'] = dst_ip['ip'].apply(lambda x: str(find(x)).split('\t'))
def oneIP(x):
    if x['ip'] != '':
        return x['ip']
    elif x['inip'] != '':
        return x['inip']
df_ip['ip'] = df_ip.apply(oneIP, axis=1)
# Produce a list directly
df_a = df.groupby('name')['a'].apply(list)
def get_city(x):
    return x
dstip['city'] = dstip['addr'].apply(get_city)
import locale

def sizes(s):
    # format one row's byte count as KB / MB / GB strings
    s['size_kb'] = locale.format_string("%.1f", s['size'] / 1024.0, grouping=True) + ' KB'
    s['size_mb'] = locale.format_string("%.1f", s['size'] / 1024.0 ** 2, grouping=True) + ' MB'
    s['size_gb'] = locale.format_string("%.1f", s['size'] / 1024.0 ** 3, grouping=True) + ' GB'
    return s

df_test = df_test.append(rows_list)
df_test = df_test.apply(sizes, axis=1)
# Alternatively, have the function `return a, b, c` and unpack the tuples:
df_test['size_kb'], df_test['size_mb'], df_test['size_gb'] = zip(*df_test['size'].apply(sizes))
df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
# Display formatting
df.style.format("{:.2%}")
df['c'] = ['a', 'b', 'c', 'd']
df.style.format({'c': str.upper})
# Broadcasting: apply an operation against every row
row = df.iloc[1]  # df.ix (mixed indexing) is deprecated; use iloc/loc
df.sub(row, axis=1)
# Dummy variables
dummies = pd.get_dummies(df['key'])
df = df.join(dummies)
# Pivot tables and crosstabs; similar to SAS's proc freq step, or aggregate/cast in R
pd.pivot_table()
pd.crosstab()
# http://python.jobbole.com/81212/
# Cleaning a single DataFrame column
# Remove newlines:
misc['product_desc'] = misc['product_desc'].str.replace('\n', '')
# Strip leading/trailing whitespace:
df["Make"] = df["Make"].map(str.strip)
# The fuzzy-matching version:
rpt[rpt['STK_ID'].str.contains(r'^600[0-9]{3}$')]
# A few other tricks
df2[df2['A'].map(lambda x: x.startswith('61'))]  # rows where A starts with '61'
df = df['col'].map('{:,.2f}'.format)  # format numbers straight into strings
# input:
'''
        A   B
0  [1, 2]  10
1  [5, 6] -20
'''
# output:
'''
   A   B
0  1  10
1  2  10
2  5 -20
3  6 -20
'''
# https://stackoverflow.com/questions/38428796/how-to-do-lateral-view-explode-in-pandas
# This approach is junk (pd.Panel has since been removed from pandas; xrange is Python 2)
A = np.asarray(df.A.values.tolist())
B = np.stack([df.B for _ in range(A.shape[1])]).T
P = np.stack([A, B])
pd.Panel(P, items=['A', 'B']).to_frame().reset_index(drop=True)
df1 = df.A.apply(pd.Series).stack().rename('A')
df2 = df1.to_frame().reset_index(1, drop=True)
df2.join(df.B).reset_index(drop=True)
pd.DataFrame([[item] + list(df.loc[line, 'B':]) for line in df.index for item in df.loc[line, 'A']],
             columns=df.columns)
# Recommended in many situations
rows = []
for i, row in df.iterrows():
    for a in row.A:
        rows.append([a, row.B])
pd.DataFrame(rows, columns=df.columns)
# Available from pandas 0.25: https://pandas.pydata.org/pandas-docs/version/0.25/reference/api/pandas.DataFrame.explode.html
df.assign(A=df['A'].str.split(',')).explode('A')  # split a string column, then explode
df.explode('A')  # if A already holds lists
# Recommendation 2
df_tmp = df['A'].apply(pd.Series)  # split into multiple columns
# then join them back on
df = pd.concat([df, df_tmp], axis='columns')
rows = []
for i, row in df.iterrows():
    for a in row['ip']:
        row = row.copy()
        row['ipv4'] = a
        rows.append(row)
df = pd.DataFrame(rows)
# https://stackoverflow.com/questions/32468402/how-to-explode-a-list-inside-a-dataframe-cell-into-separate-rows
df['A'].apply(pd.Series).stack().reset_index(level=-1, drop=True).to_frame('A')
df_tmp = df_vms.ip.apply(pd.Series)  # split into multiple columns
# then join them back on
df = pd.concat([df, df_tmp], axis='columns')
# categories vs set_categories
# Assigning to .categories relabels the categories; the assignment maps by position:
# a -> very good, b -> good, c -> very bad. Afterwards, what was labeled a is labeled very good.
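# Sketch of that relabeling (rename_categories maps by position):
s = pd.Series(['a', 'b', 'a', 'c']).astype('category')
s = s.cat.rename_categories(['very good', 'good', 'very bad'])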
# Check that the column holds numbers
df['col'].apply(lambda x: x.isalnum())
df['col'].str.isdigit()
# Compute HTML-style rowspan values for hierarchical row labels
def df_rowspan(df, rowname):
    for x in range(0, len(rowname)):
        df['tmp_row'] = df[rowname[:x + 1]].values.tolist()
        tmp = df['tmp_row'] != df['tmp_row'].shift()  # compare each row with the previous one
        df['tmp_x'] = tmp.astype(int)
        df['x_row'] = tmp.cumsum()  # running id for each run of equal values
        df['x_c'] = df.groupby(['x_row']).cumcount(ascending=False) + 1
        df[rowname[x] + '_rowspan'] = df['tmp_x'] * df['x_c']
    df = df.drop(['tmp_row', 'tmp_x', 'x_row', 'x_c'], axis=1)
    return df
# The simplest pandas lookups
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
df.where(df['A'] >= df2['B_low'])
df.query('a > b')
# Cross join via a dummy key
df = pd.merge(df.assign(key=0), df.assign(key=0), on='key').drop('key', axis=1)
# Print everything
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# https://zhuanlan.zhihu.com/p/32929522
# Merge on a fuzzy-matched key (get_closest_match is a user-defined helper from the link above)
df1['conm'] = df1['conm'].map(lambda x: get_closest_match(x, df2['conm']))
# Merge the two datasets
result = pd.merge(df1, df2, on=['conm', 'year'])  # matched observations only
result_all = pd.merge(df1, df2, how='left', on=['conm', 'year'], indicator='indicator')  # keep everything from the left (master) file
# MultiIndex
# https://blog.csdn.net/Darkman_EX/article/details/80723802
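# Minimal MultiIndex sketch:
idx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=['letter', 'num'])
dfm = pd.DataFrame({'v': range(4)}, index=idx)
dfm.loc['a']  # everything under the first-level label 'a'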
# Split a list column into separate rows
rows = []
for i, row in df.iterrows():
    for a in row['iplist']:
        row = row.copy()
        row['ip'] = a
        rows.append(row)
df_vms_ip = pd.DataFrame(rows)
# Build a tree (e.g. for a tree diagram) from multi-level indicator columns
cols = ['一级指标', '二级指标', '三级指标', '四级指标']
# for name in cols:
#     colname_list = df_zb[name].drop_duplicates().values.tolist()
#     cols_list.append(colname_list)
def tree(df, cols, keys):
    ret = []
    colname_list = df[cols[0]].drop_duplicates().values.tolist()
    for colname in colname_list:
        tmp = {}
        tmp['name'] = colname
        if len(cols) > 1:
            for key in keys:
                tmp[key] = -1
            tmp['children'] = tree(df[df[cols[0]] == colname], cols[1:], keys)
        else:
            for key in keys:
                tmp[key] = df[df[cols[0]] == colname][key].values.tolist()[0]
        ret.append(tmp)
    return ret

import pprint
pprint.pprint(tree(df_zb, cols, ['id']))
from prettypandas import PrettyPandas  # assumes the PrettyPandas package is installed
df.pipe(PrettyPandas).as_currency('GBP', subset='A').as_percent(subset='B').total().average()
# True where the value increases by exactly 1 from the previous row
df_title['a'] = df_title['ind'].diff() == 1
# https://www.chzzz.club/post/167.html
pd.get_dummies()  # encode a categorical variable as dummy (binary) variables
pd.factorize()    # encode a categorical variable as integer codes
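# factorize sketch:
codes, uniques = pd.factorize(pd.Series(['b', 'a', 'b']))
# codes -> array([0, 1, 0]); uniques -> the distinct values in order of first appearance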
import pandas_profiling
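# One-line exploratory report (assuming the classic pandas_profiling API):
# pandas_profiling.ProfileReport(df)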