# 数据挖掘 | 利用python进行商品亲和性分析

Python 数据 利用 进行 挖掘

## 1. 什么是亲和性分析

`支持度` 是规则在数据集中出现的次数，即匹配规则的样本数，比如同时购买商品X和Y的交易数；

`置信度` 衡量规则的准确度，即在满足前提条件的交易中结论也成立的比例，比如在购买商品X的交易中同时购买商品Y的比例。

## 2. 案例详解

• `numpy`
• `collections`

### 2.1. 加载数据集

``````import numpy as np
# Transaction matrix file; the code below treats each row as one transaction
# and each column as one product (a value of 1 means "purchased").
dataset_filename = "affinity_dataset.txt"
# Load the matrix into a 2-D float array. The original snippet used X without
# ever defining it; np.loadtxt assumes a whitespace-delimited numeric text
# file -- TODO confirm against the actual data file.
X = np.loadtxt(dataset_filename)
n_samples, n_features = X.shape
print(f"这份数据集 有 {n_samples} 行 和 {n_features} 列")
# Show the first five transactions as a sanity check.
print(X[:5])
``````
``````这份数据集 有 100 行 和 5 列
[[0. 1. 0. 0. 0.]
[1. 1. 0. 0. 0.]
[0. 0. 1. 0. 1.]
[1. 1. 0. 0. 0.]
[0. 0. 1. 1. 1.]]
``````

``````features = ["面包", "牛奶", "奶酪", "苹果", "香蕉"]  # names of the columns of X, in order: bread, milk, cheese, apple, banana
``````

### 2.2. 求苹果->香蕉的亲和性

``````num_apple_purchases = 0
# Count the transactions (rows of X) in which apples were bought.
for sample in X:
    if sample[3] == 1:  # index 3 is the 4th column, i.e. 苹果 (apple)
        num_apple_purchases += 1
print(f"含苹果的交易数为：{num_apple_purchases}")
``````
``````含苹果的交易数为：43
``````

``````rule_valid = 0
# Count the transactions that match the rule "apple -> banana":
# apples (column 3) and bananas (column 4) bought together.
for sample in X:
    if sample[3] == 1 and sample[4] == 1:
        rule_valid += 1
print(f"同时购买苹果和香蕉的交易数为：{rule_valid}")
``````
``````同时购买苹果和香蕉的交易数为：27
``````

``````support = rule_valid # 支持度
# Confidence = matching transactions / transactions containing the premise.
confidence = rule_valid / num_apple_purchases
print(f"苹果-香蕉的支持度为 {support}\n苹果—>香蕉的置信度为 {confidence:.3f}")
# Confidence can be thought of as a percentage using the following:
print(f"苹果—>香蕉的置信度百分比为 {confidence*100:.1f}%.")
``````
``````苹果-香蕉的支持度为 27
苹果—>香蕉的置信度为 0.628
苹果—>香蕉的置信度百分比为 62.8%.
``````

### 2.3. 亲和性分析

``````from collections import defaultdict
# Two counters; defaultdict(int) lets unseen keys start at 0.
valid_rules = defaultdict(int)     # (premise, conclusion) -> transactions containing both
num_occurences = defaultdict(int)  # premise -> transactions containing the premise

# One pass over the transactions: for every purchased product X (premise),
# count it, then count every other purchased product Y (conclusion) with it.
for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0:
            continue
        # Number of times the premise X of a rule X -> Y occurs.
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:  # X -> X is meaningless, skip it
                continue
            if sample[conclusion] == 1:
                # X and Y occur together: one more match for the rule X -> Y.
                valid_rules[(premise, conclusion)] += 1

# Support of a rule is simply its match count.
support = valid_rules
# Confidence of X -> Y = matches of (X, Y) / occurrences of X.
confidence = defaultdict(float)
for (premise, conclusion), n_valid in valid_rules.items():
    confidence[(premise, conclusion)] = n_valid / num_occurences[premise]
# Print every rule with its confidence (as a percentage) and its support,
# in the order the rules were first inserted into the dict.
for premise, conclusion in confidence:
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print(f"规则: {premise_name}—>{conclusion_name}")
    print(f" - 置信度: {confidence[(premise, conclusion)]*100:.2f}%")
    print(f" - 支持度: {support[(premise, conclusion)]}")
    print('-'*20)
``````

``````规则: 面包—>牛奶
- 置信度: 46.43%
- 支持度: 13
--------------------
规则: 牛奶—>面包
- 置信度: 25.00%
- 支持度: 13
--------------------
规则: 奶酪—>香蕉
- 置信度: 51.28%
- 支持度: 20
--------------------
规则: 香蕉—>奶酪
- 置信度: 35.09%
- 支持度: 20
--------------------
规则: 奶酪—>苹果
- 置信度: 56.41%
- 支持度: 22
--------------------
规则: 苹果—>奶酪
- 置信度: 51.16%
- 支持度: 22
--------------------
规则: 苹果—>香蕉
- 置信度: 62.79%
- 支持度: 27
--------------------
规则: 香蕉—>苹果
- 置信度: 47.37%
- 支持度: 27
--------------------
规则: 牛奶—>苹果
- 置信度: 34.62%
- 支持度: 18
--------------------
规则: 苹果—>牛奶
- 置信度: 41.86%
- 支持度: 18
--------------------
规则: 牛奶—>香蕉
- 置信度: 51.92%
- 支持度: 27
--------------------
规则: 香蕉—>牛奶
- 置信度: 47.37%
- 支持度: 27
--------------------
规则: 面包—>奶酪
- 置信度: 17.86%
- 支持度: 5
--------------------
规则: 奶酪—>面包
- 置信度: 12.82%
- 支持度: 5
--------------------
规则: 面包—>香蕉
- 置信度: 57.14%
- 支持度: 16
--------------------
规则: 香蕉—>面包
- 置信度: 28.07%
- 支持度: 16
--------------------
规则: 牛奶—>奶酪
- 置信度: 21.15%
- 支持度: 11
--------------------
规则: 奶酪—>牛奶
- 置信度: 28.21%
- 支持度: 11
--------------------
规则: 面包—>苹果
- 置信度: 32.14%
- 支持度: 9
--------------------
规则: 苹果—>面包
- 置信度: 20.93%
- 支持度: 9
--------------------
``````

``````def print_rule(premise, conclusion, support, confidence, features):
premise_name = features[premise]
conclusion_name = features[conclusion]
print(f"规则: {
premise_name}—>{
conclusion_name}")
print(f" - 置信度: {
confidence[(premise, conclusion)]*100:.2f}%")
print(f" - 支持度: {
support[(premise, conclusion)]}")
print('-'*20)
from operator import itemgetter

# Rank the rules, highest value first. Each entry is ((premise, conclusion), value).
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)

# Show the five most confident rules. Slicing (instead of indexing 0..4)
# keeps this working when fewer than five rules exist.
for index, ((premise, conclusion), _) in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index + 1))
    print_rule(premise, conclusion, support, confidence, features)
``````

``````Rule #1
规则: 苹果—>香蕉
- 置信度: 62.79%
- 支持度: 27
--------------------
Rule #2
规则: 面包—>香蕉
- 置信度: 57.14%
- 支持度: 16
--------------------
Rule #3
规则: 奶酪—>苹果
- 置信度: 56.41%
- 支持度: 22
--------------------
Rule #4
规则: 牛奶—>香蕉
- 置信度: 51.92%
- 支持度: 27
--------------------
Rule #5
规则: 奶酪—>香蕉
- 置信度: 51.28%
- 支持度: 20
--------------------
``````

## 3. 算法优化

``````import pandas as pd
# Standard-library helper used to enumerate the candidate rules.
import itertools

# Wrap the dataset in a DataFrame whose columns are the product names.
df = pd.DataFrame(X, columns=features)

# All ordered pairs of distinct products: (X, Y) and (Y, X) are different rules.
it = itertools.permutations(features, 2)
rules = list(it)

for rule in rules:
    # Number of transactions containing the premise X of the rule X -> Y.
    num_occurence = df[rule[0]].sum()
    # Support: transactions containing both X and Y. Values are 0/1, so
    # "X + Y == 2" holds exactly when both were bought.
    num_rule = df.query(f"{rule[0]}+{rule[1]}==2").shape[0]
    # Equivalent ways to compute the same count:
    # df.query(f"{rule[0]}*{rule[1]}==1").shape[0]  # product of the two columns equals 1
    # df[(df[rule[0]]==1) & (df[rule[1]]==1)].shape[0]  # both columns equal 1
    # df[df[rule[0]]==1][rule[1]].sum()  # sum of Y over the rows where X was bought
    # df[list(rule)].all(axis = 'columns').sum()  # 0/1 data, so all() works as a boolean test
    confidence = num_rule / num_occurence  # confidence of X -> Y
    print(f'规则：{rule[0]}—>{rule[1]}\n- 置信度：{confidence*100:.2f}%\n- 支持度：{num_rule}\n')
``````

``````规则：面包—>牛奶
- 置信度：46.43%
- 支持度：13

规则：面包—>奶酪
- 置信度：17.86%
- 支持度：5

规则：面包—>苹果
- 置信度：32.14%
- 支持度：9

规则：面包—>香蕉
- 置信度：57.14%
- 支持度：16

规则：牛奶—>面包
- 置信度：25.00%
- 支持度：13
``````

``````columns=['规则','置信度','支持度']
# Collect one row per rule and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop copies the whole frame on every iteration.
rows = []
for rule in rules:
    # Number of transactions containing the premise X of the rule X -> Y.
    num_occurence = df[rule[0]].sum()
    # Support: sum of Y over the rows where X was bought (bought = 1, not bought = 0).
    num_rule = df[df[rule[0]]==1][rule[1]].sum()
    confidence = num_rule / num_occurence  # confidence of X -> Y
    rows.append([f'{rule[0]}—>{rule[1]}', confidence, num_rule])
data = pd.DataFrame(rows, columns=columns)
# The five rules with the highest confidence (the first one wins on ties).
data.nlargest(n=5, columns= '置信度', keep='first')
``````