辽经干3
2021-04-07 本文已影响0人
__method__
set and string
# Literal syntax cheat sheet:
#   set   {}
#   dict  {key: value, ...}
#   tuple ()
#   list  []

# --- set: unordered, duplicates collapse into a single element ---
a = {"1", "2", "3", 4, 4, 4}
print(a)
print(type(a))

nameset = ["张三", "李四", "李四", "李四", "李四"]
unique_names = set(nameset)
# Show the list and its de-duplicated set, with their sizes.
for shown in (len(nameset), nameset, unique_names, len(unique_names)):
    print(shown)
# NOTE: sets cannot be indexed; a[0] raises
# TypeError: 'set' object is not subscriptable

# --- string: anything enclosed in single or double quotes ---
# Strings are immutable once created: replace() returns a new string and
# leaves the original untouched until the name is rebound.
name = "$liao, ning jingji"
print(len(name))
print(name.split())
replaced = name.replace(",", ";")
print(replaced)
print(name)  # still the original text
name = replaced
print(name)

# --- join: turn a list of strings into one string ---
ls = ["I", "am", "a", "good", "student"]
str1, str2 = "-".join(ls), " ".join(ls)
print(str1)
print(str2)
file io
# Reading and writing files.
s = "辽宁省沈阳市沈北新区"

# Write: mode='w' opens the file for writing. The `with` block closes the
# handle on exit, which flushes the buffered text to disk before the file
# is opened again below.
with open('hello.txt', mode='w', encoding='utf-8') as f:
    f.write(s)

# Read: mode='r' opens the file for reading; the handle is closed on exit.
with open('hello.txt', mode='r', encoding='utf-8') as f:
    data = f.read()
print(data)
jieba
# Chinese word segmentation.
# The jieba package does this out of the box.
# Install it with: pip install jieba
import jieba

seg = "我来自北京清华大学"
# lcut() returns the segmented words as a list.
tokens = jieba.lcut(seg)
print(tokens)
sort
counts = {
    '第一回': 1,
    '桃园': 19,
    '豪杰': 22,
    '结义': 14,
    '黄巾': 40,
}
# Sorting: turn the dict into a list of (word, count) tuples, ordered by
# the second tuple element (the count), largest first.
ls = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
print(ls)
analysis
# Word-frequency analysis of threekingdom.txt: segment the text with jieba,
# count multi-character words, merge character aliases, drop stop words and
# print the ten most frequent entries.
from collections import Counter

# Canonical name -> alias tokens whose counts are folded into it.
ALIASES = {
    "孔明": ("孔明曰",),
    "关公": ("云长",),
    "刘备": ("玄德曰", "玄德"),
}

# Tokens excluded from the ranking (includes the aliases merged above).
STOP_WORDS = {
    "将军", "却说", "丞相", "孔明曰", "二人", "不可", "荆州", "不能",
    "如此", "商议", "如何", "主公", "军士", "军马", "左右", "玄德曰",
    "玄德", "云长", "次日", "引兵", "大喜", "东吴", "于是", "今日",
    "不敢", "天下", "魏兵", "陛下", "都督",
}


def count_words(words):
    """Return a Counter of the tokens in *words* longer than one character.

    Tokens of length 0 or 1 are skipped, matching the original loop's
    ``len(word) <= 1`` filter.
    """
    return Counter(word for word in words if len(word) > 1)


def merge_aliases(counts, aliases):
    """Add each alias's count to its canonical name, in place.

    A missing canonical name or alias counts as zero instead of raising
    KeyError. The alias entries themselves are left in *counts*.
    """
    for canonical, alias_names in aliases.items():
        total = counts.get(canonical, 0)
        total += sum(counts.get(alias, 0) for alias in alias_names)
        if total:
            counts[canonical] = total


def remove_stop_words(counts, stop_words):
    """Delete every stop word from *counts*, ignoring absent ones."""
    for word in stop_words:
        counts.pop(word, None)


def top_words(counts, limit=10):
    """Return the *limit* highest (word, count) pairs, largest first."""
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:limit]


def main():
    """Run the analysis on threekingdom.txt and print the top ten words."""
    # Imported here so the helpers above can be used without jieba.
    import jieba

    with open('threekingdom.txt', mode='r', encoding='utf-8') as f:
        data = f.read()
    print(len(data))
    word_list = jieba.lcut(data)
    print(len(word_list))

    # Word-frequency statistics.
    counts = count_words(word_list)
    merge_aliases(counts, ALIASES)
    remove_stop_words(counts, STOP_WORDS)
    for name, num in top_words(counts):
        print(name, num)


if __name__ == "__main__":
    main()