Aller au contenu

Association Rules

Packages

import mlxtend
print(mlxtend.__version__)
0.22.0

import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

Get data

DATASET_PATH = "Data/Sample - Superstore.xls"
data = pd.read_excel(DATASET_PATH, sheet_name="Orders")
print(data.head().T)
                                               0  ...                               4
Row ID                                         1  ...                               5
Order ID                          CA-2016-152156  ...                  US-2015-108966
Order Date                   2016-11-08 00:00:00  ...             2015-10-11 00:00:00
Ship Date                    2016-11-11 00:00:00  ...             2015-10-18 00:00:00
Ship Mode                           Second Class  ...                  Standard Class
Customer ID                             CG-12520  ...                        SO-20335
Customer Name                        Claire Gute  ...                  Sean O'Donnell
Segment                                 Consumer  ...                        Consumer
Country                            United States  ...                   United States
City                                   Henderson  ...                 Fort Lauderdale
State                                   Kentucky  ...                         Florida
Postal Code                                42420  ...                           33311
Region                                     South  ...                           South
Product ID                       FUR-BO-10001798  ...                 OFF-ST-10000760
Category                               Furniture  ...                 Office Supplies
Sub-Category                           Bookcases  ...                         Storage
Product Name   Bush Somerset Collection Bookcase  ...  Eldon Fold 'N Roll Cart System
Sales                                     261.96  ...                          22.368
Quantity                                       2  ...                               2
Discount                                     0.0  ...                             0.2
Profit                                   41.9136  ...                          2.5164

[21 rows x 5 columns]

print(data.shape)
(9994, 21)

Data Exploration

print(data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Row ID         9994 non-null   int64         
 1   Order ID       9994 non-null   object        
 2   Order Date     9994 non-null   datetime64[ns]
 3   Ship Date      9994 non-null   datetime64[ns]
 4   Ship Mode      9994 non-null   object        
 5   Customer ID    9994 non-null   object        
 6   Customer Name  9994 non-null   object        
 7   Segment        9994 non-null   object        
 8   Country        9994 non-null   object        
 9   City           9994 non-null   object        
 10  State          9994 non-null   object        
 11  Postal Code    9994 non-null   int64         
 12  Region         9994 non-null   object        
 13  Product ID     9994 non-null   object        
 14  Category       9994 non-null   object        
 15  Sub-Category   9994 non-null   object        
 16  Product Name   9994 non-null   object        
 17  Sales          9994 non-null   float64       
 18  Quantity       9994 non-null   int64         
 19  Discount       9994 non-null   float64       
 20  Profit         9994 non-null   float64       
dtypes: datetime64[ns](2), float64(3), int64(3), object(13)
memory usage: 1.6+ MB
None

data = data[["Customer ID", "Sub-Category"]]
print(data.sample(10))
     Customer ID Sub-Category
6496    KH-16510  Accessories
484     MT-18070       Labels
855     BK-11260        Paper
227     DS-13180      Storage
6546    DP-13000        Paper
822     TR-21325  Accessories
6610    AG-10495      Binders
2434    PJ-18835    Envelopes
6121    JR-16210    Bookcases
5197    AI-10855       Phones

print(data.describe())
       Customer ID Sub-Category
count         9994         9994
unique         793           17
top       WB-21850      Binders
freq            37         1523

Preprocessing

flatten = lambda cart: list(cart)

carts = data.groupby(by="Customer ID").agg({"Sub-Category": flatten}).reset_index().rename(columns={"Sub-Category": "Chart"})
print(carts.head(10))
  Customer ID                                              Chart
0    AA-10315  [Appliances, Binders, Storage, Binders, Applia...
1    AA-10375  [Storage, Furnishings, Accessories, Binders, A...
2    AA-10480  [Paper, Furnishings, Paper, Storage, Paper, Pa...
3    AA-10645  [Chairs, Phones, Chairs, Furnishings, Envelope...
4    AB-10015  [Chairs, Art, Storage, Storage, Phones, Bookca...
5    AB-10060  [Accessories, Paper, Binders, Paper, Binders, ...
6    AB-10105  [Tables, Furnishings, Binders, Phones, Labels,...
7    AB-10150  [Accessories, Paper, Supplies, Art, Furnishing...
8    AB-10165  [Accessories, Paper, Art, Art, Paper, Binders,...
9    AB-10255  [Phones, Storage, Accessories, Supplies, Paper...

encoder = TransactionEncoder()
encoder.fit(carts["Chart"])
chart_array = encoder.transform(carts["Chart"])
df_chart = pd.DataFrame(chart_array, columns=encoder.columns_, dtype=int)
print(df_chart.head(10))
   Accessories  Appliances  Art  Binders  Bookcases  Chairs  ...  Machines  Paper  Phones  Storage  Supplies  Tables
0            1           1    0        1          0       0  ...         0      1       1        1         1       0
1            1           0    1        1          0       0  ...         0      1       1        1         0       0
2            1           0    1        0          0       0  ...         0      1       1        1         0       1
3            0           0    1        1          1       1  ...         0      1       1        1         0       0
4            0           0    1        0          1       1  ...         0      0       1        1         0       0
5            1           1    0        1          0       1  ...         0      1       0        0         1       1
6            1           0    1        1          0       0  ...         1      0       1        1         0       1
7            1           0    1        1          0       0  ...         0      1       0        0         1       0
8            1           0    1        1          0       1  ...         0      1       0        1         0       0
9            1           0    1        1          0       0  ...         0      1       1        1         1       0

[10 rows x 17 columns]

Itemsets

frequent_itemsets = apriori(df_chart, min_support=0.45, use_colnames=True)
print(frequent_itemsets)
     support                       itemsets
0   0.597730                  (Accessories)
1   0.622951                          (Art)
2   0.819672                      (Binders)
3   0.513241                       (Chairs)
4   0.665826                  (Furnishings)
5   0.770492                        (Paper)
6   0.644388                       (Phones)
7   0.648172                      (Storage)
8   0.508197         (Accessories, Binders)
9   0.491803           (Paper, Accessories)
10  0.523329                 (Art, Binders)
11  0.493064                   (Paper, Art)
12  0.553594         (Furnishings, Binders)
13  0.650694               (Paper, Binders)
14  0.532156              (Binders, Phones)
15  0.546028             (Storage, Binders)
16  0.535939           (Paper, Furnishings)
17  0.450189          (Furnishings, Phones)
18  0.520807                (Paper, Phones)
19  0.535939               (Paper, Storage)
20  0.451450  (Paper, Furnishings, Binders)
21  0.460277      (Paper, Storage, Binders)

Association Rules

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules = rules[["antecedents", "consequents", "support", "confidence", "lift", "leverage"]]
# rules
print(rules)
               antecedents       consequents   support  confidence      lift  leverage
0            (Accessories)         (Binders)  0.508197    0.850211  1.037257  0.018254
1            (Accessories)           (Paper)  0.491803    0.822785  1.067870  0.031257
2                    (Art)         (Binders)  0.523329    0.840081  1.024899  0.012714
3                    (Art)           (Paper)  0.493064    0.791498  1.027263  0.013086
4            (Furnishings)         (Binders)  0.553594    0.831439  1.014356  0.007835
5                  (Paper)         (Binders)  0.650694    0.844517  1.030311  0.019143
6                (Binders)           (Paper)  0.650694    0.793846  1.030311  0.019143
7                 (Phones)         (Binders)  0.532156    0.825832  1.007515  0.003969
8                (Storage)         (Binders)  0.546028    0.842412  1.027743  0.014740
9            (Furnishings)           (Paper)  0.535939    0.804924  1.044689  0.022926
10                (Phones)           (Paper)  0.520807    0.808219  1.048965  0.024311
11               (Storage)           (Paper)  0.535939    0.826848  1.073143  0.036529
12    (Paper, Furnishings)         (Binders)  0.451450    0.842353  1.027671  0.012156
13  (Furnishings, Binders)           (Paper)  0.451450    0.815490  1.058402  0.024911
14        (Paper, Storage)         (Binders)  0.460277    0.858824  1.047765  0.020983
15        (Paper, Binders)         (Storage)  0.460277    0.707364  1.091323  0.038516
16      (Storage, Binders)           (Paper)  0.460277    0.842956  1.094049  0.039568
17               (Storage)  (Paper, Binders)  0.460277    0.710117  1.091323  0.038516