关联规则产生(基于直接遍历方法)

  • 发布日期:2019-10-29
  • 难度:简单
  • 类别:关联规则挖掘
  • 标签:Python、购物篮、关联规则、遍历

1. 问题描述

自编Python程序通过基于直接遍历方式实现关联规则产生。购物篮数据来自IBM SPSS Modeler软件自带的BASKETS1n数据集。

注:需要先运行频繁项集挖掘代码块,再运行关联规则产生代码块。

2. 程序实现

In [5]:
#----------以下是apriori.py进行频繁项集挖掘的代码-------
def create_C1(data_set):
    C1 = set()
    for t in data_set:
        for item in t:
            item_set = frozenset([item])
            #将频繁1-项集中的每一项都转换为不可变集合
            C1.add(item_set)
    return C1



def create_Ck(Lksub1, k):    #Lksub1即为L(k-1)
    Ck = set()
    len_Lksub1 = len(Lksub1)
    list_Lksub1 = list(Lksub1)
    #利用Fk-1*Fk-1算法生成候选项集
    for i in range(len_Lksub1):
        for j in range(1, len_Lksub1):
            if(i<j):
                l1 = list(list_Lksub1[i])
                l2 = list(list_Lksub1[j])
                l1.sort()
                l2.sort()
                if l1[0:k-2] == l2[0:k-2]:
                    Ck_item = list_Lksub1[i] | list_Lksub1[j]
                    # 子集测试
                    sub_test=1
                    for item in Ck_item:
                        sub_Ck = Ck_item - frozenset([item])
                        if sub_Ck not in Lksub1:
                            sub_test=0
                            break
                    if sub_test==1:
                        Ck.add(Ck_item)
    return Ck

def generate_Lk_by_Ck(data_set, Ck, min_support,support_data):
    Lk = set()
    item_count = {}
    tran_num = float(len(data_set))
    #对候选k-项集出现的次数进行计数
    for t in data_set:
        for item in Ck:
            if item.issubset(t):
                if item not in item_count:
                    item_count[item] = 1
                else:
                    item_count[item] += 1
    #通过最小支持度筛选得到频繁k-项集
    for item in item_count:
        if (item_count[item] / tran_num) >= min_support:
            Lk.add(item)
            support_data[item] = item_count[item] / tran_num
    return Lk


def generate_L(data_set, min_support):
    support_data = {}
    L=[]
    C1 = create_C1(data_set)
    L1 = generate_Lk_by_Ck(data_set, C1, min_support, support_data)
    L.append(L1)
    Lksub1 = L1.copy()
    i=2
    #从k=2开始设定一个循环,直到候选k-项集或频繁k-项集为空时结束
    while 1:
        Ci = create_Ck(Lksub1, i)
        Li = generate_Lk_by_Ck(data_set, Ci, min_support, support_data)
        if len(Li)>0:
            L.append(Li)
        if len(Ci)==0 or len(Li)==0:
            break
        Lksub1 = Li.copy()
        i+=1
    return L,support_data


import xlrd  #excel文件读取函数
#定义数据集读取函数
def load_data_set():
    data_set=[]
    data = xlrd.open_workbook('Basket1n.xls')
    table = data.sheets()[0]   #索引到第一个工作表
    card_id = table.col_values(0)[1:]
    for i in range(1,len(card_id)+1):
        one_tran=[]
        for j in range(7,18):  #商品对应的列数
            if table.cell(i,j).value=='T':
                one_tran.append(table.cell(0,j).value)
        data_set.append(one_tran)
    return data_set
#读取数据集
data_set = load_data_set()
#通过事务数据集和最小支持度挖掘频繁项集
L,support_data = generate_L(data_set, min_support=0.1)
#输出频繁项集挖掘结果
print('频繁项集'+'\t'+'支持度')
for i in support_data:
    print(str(i)+':'+str(support_data[i]))
#共得到17个频繁项集
#----------以上是apriori.py进行频繁项集挖掘的代码-------
频繁项集	支持度
frozenset({'新鲜肉类'}):0.183
frozenset({'牛奶'}):0.177
frozenset({'糖果'}):0.276
frozenset({'鱼'}):0.292
frozenset({'冷冻食品'}):0.302
frozenset({'灌装蔬菜'}):0.303
frozenset({'啤酒'}):0.293
frozenset({'白酒'}):0.287
frozenset({'水果蔬菜'}):0.299
frozenset({'汽水'}):0.184
frozenset({'灌装肉品'}):0.204
frozenset({'灌装蔬菜', '啤酒'}):0.167
frozenset({'灌装蔬菜', '冷冻食品'}):0.173
frozenset({'冷冻食品', '啤酒'}):0.17
frozenset({'水果蔬菜', '鱼'}):0.145
frozenset({'糖果', '白酒'}):0.144
frozenset({'灌装蔬菜', '冷冻食品', '啤酒'}):0.146
In [6]:
def generateRules(L, support_data, min_conf):
    rulelist = []
    sub_set_list = []
    for i in range(0, len(L)):
        for freq_set in L[i]:
            for sub_set in sub_set_list:
                if sub_set.issubset(freq_set):
                    conf = support_data[freq_set] / support_data[freq_set - sub_set]
                    a_rule = (freq_set - sub_set, sub_set, conf)
                    if conf >= min_conf and a_rule not in rulelist:
                        rulelist.append(a_rule)
            sub_set_list.append(freq_set)
    return rulelist

rules = generateRules(L, support_data, min_conf=0.5)
print('前项=>后项\t置信度')
for rule in rules:
    print(str(rule[0])+'=>'+str(rule[1])+'\t'+str(round(rule[2],3)))
#通过运行程序,共得到11条关联规则
前项=>后项	置信度
frozenset({'灌装蔬菜'})=>frozenset({'冷冻食品'})	0.571
frozenset({'冷冻食品'})=>frozenset({'灌装蔬菜'})	0.573
frozenset({'糖果'})=>frozenset({'白酒'})	0.522
frozenset({'白酒'})=>frozenset({'糖果'})	0.502
frozenset({'啤酒'})=>frozenset({'冷冻食品'})	0.58
frozenset({'冷冻食品'})=>frozenset({'啤酒'})	0.563
frozenset({'啤酒'})=>frozenset({'灌装蔬菜'})	0.57
frozenset({'灌装蔬菜'})=>frozenset({'啤酒'})	0.551
frozenset({'灌装蔬菜', '啤酒'})=>frozenset({'冷冻食品'})	0.874
frozenset({'冷冻食品', '啤酒'})=>frozenset({'灌装蔬菜'})	0.859
frozenset({'冷冻食品', '灌装蔬菜'})=>frozenset({'啤酒'})	0.844
In [ ]: