Python: Pandas
pip install xlrd 可以用来读取excel
import pandas as pd
import datetime
import matplotlib
import matplotlib.pyplot as plt
import os #os 适用于windows的路径,包括路径拼接等
cur_dir = os.path.dirname(__file__)
# __file__ 为当前.py file 的位置
df = pd.read_csv(os.path.join(cur_dir,"employees.csv"))
#print(cur_dir)
#print(df)
#
# cur_dir is C:/Users/T596494/FZPython/Insideout2_dist
# os.path.join(cur_dir,"employees.csv") is C:/Users/T596494/FZPython/Insideout2_dist\employees.csv
# it can automatically add \
# when print df, it will print the file details
# df will show up the whole document's details - "employees.csv"
print(df.shape)
# shape will show the number of rows and columns of the data. - (1000, 4) 1000 rows, 4 columns 不含title的数据行数
# print(df.dtypes)
# dtypes check each columns' data type
# employee_id int64
# annual_inc float64
# employee_title object
# home_ownership object
# dtype: object
print(df.columns)
# Index(['employee_id', 'annual_inc', 'employee_title', 'home_ownership'], dtype='object')
# show each columns' title
print(df.head())
#df.head() 可以显示前面几行的数据,如果在括号里面输入3,print(df.head(3)) 则显示三行数据(包括title则为4行)
print(df.sample(10))
#随机显示10行数据,若不写,则只显示1行
print(df.tail(5))
#显示最后5行的数据
#把title为annual_inc的列储存到annual_incs的变量中。annual_incs的属性同df
#Thisis known as a Series.
annual_incs = df['annual_inc']
print(annual_incs)
print(type(annual_incs))
print(annual_incs.head())
print(annual_incs.shape)
#将第二行的数据存到row_2,并且将相应数据显示出来
row_2 = df.iloc[1]
print(row_2)
emp_id = row_2['employee_id']
emp_title = row_2['employee_title']
print('The employee with ID {emp_id} has the job title {emp_title}.' .format(emp_id = emp_id,emp_title = emp_title))
#筛选
Filtering by ONE condition, numeric
To perform filtering by numeric values, first check the column is a numeric column (int64 or float64).
Then use the following formula to do so:
df[df['column_name']<conditional operator><value>]
#筛选 df中年收入大于 300000的所有行
print(df[df['annual_inc'] >=300000])
#对字符串类型的数据的筛选
#选出所有home_ownership 是OWN的数据
print(df[df['home_ownership'] =='OWN'].head())
#选出所有employee_title 是‘Accountant'或者'Sales'的数据
print(df[df['employee_title'].isin(['Accountant','Sales'])])
#多条件筛选
#选出employee_title 是President, 并且annual_inc 超过225000的数据
print(df[ (df['employee_title'] =='President') & (df['annual_inc'] >=225000)])
#选出employee_title是Sales, 并且annual_inc 超过100000 或者小于 80000
print(df[(df['employee_title']=='Sales') & ((df['annual_inc']>=100000) | (df['annual_inc']<=80000))])
#保存,用df的id对df2的id做vlookup,并且保存到output.csv
df2 = pd.read_csv(os.path.join(cur_dir,"employees2.csv"))
output = df[df['employee_id'].isin(df2['employee_id'])]
output.to_csv("output.csv",index=False)
#显示文件的信息
print(df.info())
#Draw graph with matplotlib
df.groupby('home_ownership')['home_ownership'].value_counts().unstack().plot(kind='bar',figsize=(10,5))
plt.show()