# Apriori
# 2020-09-22 (blog view counter: 0)
# Author: Jasmine晴天和我
def loadDataSet():
    """Return the small sample transaction database used by the demo.

    Returns:
        A new list of four transactions; each transaction is a list of
        integer item ids.
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
#dataSet = loadDataSet()
def createC1(dataSet):
    """Build the level-1 candidate list C1: one singleton itemset per item.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable
            items.

    Returns:
        A list of single-element frozensets, one per distinct item.  The
        list is ordered by item value when the items can be compared with
        each other; otherwise it keeps first-seen order.
    """
    seen = set()
    items = []
    for transaction in dataSet:
        for item in transaction:
            if item not in seen:
                seen.add(item)
                items.append(item)
    # Order the raw items rather than the singleton sets: sets compare by
    # subset relation, so sorting a list of sets does not order them.
    try:
        items = sorted(items)
    except TypeError:
        # Items of mixed, non-comparable types: keep first-seen order.
        pass
    return [frozenset([item]) for item in items]
#print(C1)
#计算支持度,并保留支持度大于最小支持度的项集
def scanD(D, Ck, minSupport):
    """Measure candidate support over the transactions and keep frequent ones.

    Args:
        D: sized collection of transactions (``len(D)`` is used as the
            denominator of the support).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum support, as a fraction of ``len(D)``, that an
            itemset needs to be kept.

    Returns:
        A tuple ``(retList, supportData)``:
        retList is the list of candidates whose support is >= minSupport;
        supportData maps every candidate that occurs in at least one
        transaction to its support.  Candidates that never occur get no
        entry.
    """
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        supportData[key] = support
        if support >= minSupport:
            retList.append(key)
    return retList, supportData
#print("这是L1",L1,supportData)
'''
当集合中项的个数大于0时:
构建一个k项集组成的列表
检查数据确保每个项集都是频繁的
保留频繁项集并构建(k+1)项集组成列表
'''
#项集扩展
def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first k-2 items, taken in sorted
    order, are equal.

    Args:
        Lk: list of frozensets, the frequent itemsets of size k-1.  Items
            inside an itemset must be comparable with each other.
        k: size of the candidate itemsets to generate.

    Returns:
        A list with the union of every pair of itemsets in Lk that share
        the same sorted (k-2)-item prefix.
    """
    # Sort BEFORE slicing: set iteration order is arbitrary, so slicing the
    # raw set would not yield the k-2 smallest items and matching pairs
    # could be missed.  Computed once per itemset, not once per pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    Ck = []
    for i, left in enumerate(Lk):
        for j in range(i + 1, len(Lk)):
            if prefixes[i] == prefixes[j]:
                Ck.append(left | Lk[j])
    return Ck
def apriori(D, minSupport=0.5):
    """Mine the frequent itemsets of D level by level.

    Args:
        D: sized collection of transactions.
        minSupport: minimum support (fraction of transactions) an itemset
            needs to count as frequent.

    Returns:
        A tuple ``(L, supportData)``:
        L is a list of levels, where ``L[i]`` holds the frequent itemsets
        found at level i+1; the last element of L is always an empty list,
        because the loop stops only after appending an empty level.
        supportData merges the support dictionaries returned by scanD for
        every level.
    """
    C1 = createC1(D)
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # L[k - 2] is the most recently found level; stop once it is empty.
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
#print(L,supportData)
def generate_big_rules(L, supportData, min_conf):
    """Derive association rules from the frequent itemsets.

    Every itemset is paired with each previously visited itemset that is a
    subset of it; that subset becomes the consequent and the remaining
    items the antecedent.

    Args:
        L: list of levels, each a list of frequent itemsets (frozensets).
        supportData: dict mapping itemsets to their support.  It must
            contain every frequent itemset and every antecedent that is
            formed, otherwise the lookup raises KeyError.
        min_conf: minimum confidence a rule needs to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples, where
        confidence = support(itemset) / support(antecedent).
    """
    big_rule_list = []
    sub_set_list = []  # itemsets visited so far, used as consequents
    for level in L:
        for freq_set in level:
            for sub_set in sub_set_list:
                if sub_set.issubset(freq_set):
                    antecedent = freq_set - sub_set
                    conf = supportData[freq_set] / supportData[antecedent]
                    big_rule = (antecedent, sub_set, conf)
                    if conf >= min_conf and big_rule not in big_rule_list:
                        big_rule_list.append(big_rule)
            # Registered only after its own scan, so an itemset is never
            # paired with itself.
            sub_set_list.append(freq_set)
    return big_rule_list
# big_rules_list = generate_big_rules(L,supportData,min_conf=0.4)
# print(big_rules_list)
if __name__ == "__main__":
    # Demo run: mine the sample transactions, then print every level of
    # frequent itemsets followed by the association rules.
    dataSet = loadDataSet()
    L, supportData = apriori(dataSet, minSupport=0.5)
    big_rules_list = generate_big_rules(L, supportData, min_conf=0.4)
    for Lk in L:
        if not Lk:
            # apriori() ends its list of levels with an empty level.
            break
        print("=" * 50)
        print("frequent " + str(len(Lk[0])) + "-itemsets\t\tsupport")
        print("=" * 50)
        for freq_set in Lk:
            print(freq_set, supportData[freq_set])
    print()
    print("Big Rules")
    for item in big_rules_list:
        print(item[0], "=>", item[1], "conf: ", item[2])