Step 1: Load data

# Read in the data.
# The file path is built explicitly instead of calling setwd(), so the
# session's working directory is left untouched.
data_dir <- "~/Dropbox/Applied Data Science (IST 687)/Datasets"
  # load() restores the saved objects into the current environment and returns
  # their names; use that to confirm the expected matrix was actually restored
  loaded_names <- load(file.path(data_dir, "termDocMatrix.rdata"))
  stopifnot("termDocMatrix" %in% loaded_names)
  # store the data in a new object called "Data"
  Data <- termDocMatrix
  # check the first 5 rows of the matrix (rows are terms, columns are documents)
  head(Data, 5)
##               Docs
## Terms          1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
##   analysis     0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  1  0  1  1  1  1  0  0
##   applications 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   code         0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   computing    0 0 1 1 0 1 1 1 1  1  0  1  0  0  0  0  0  0  0  0  0  0  0
##   data         1 1 0 0 2 0 0 0 0  0  1  2  1  1  1  0  1  0  0  0  0  0  0
##               Docs
## Terms          24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
##   analysis      1  0  0  1  1  1  1  0  0  0  0  0  0  0  1  0  0  0  0  0
##   applications  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  1  0
##   code          0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   computing     0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0
##   data          0  0  0  0  1  0  0  0  1  0  0  1  1  0  0  0  0  0  1  0
##               Docs
## Terms          44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
##   analysis      0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   applications  0  0  1  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   code          0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   computing     0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0
##   data          0  1  1  1  0  0  1  0  0  0  1  1  0  0  1  1  0  1  0  1
##               Docs
## Terms          64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
##   analysis      0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  1  0  0  0
##   applications  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
##   code          0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0
##   computing     0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   data          0  1  1  0  0  0  1  0  0  0  1  0  0  0  2  0  0  0  0  1
##               Docs
## Terms          84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
##   analysis      0  0  0  0  0  0  0  0  0  0  1  0  0  1  0  0   0   1   0
##   applications  0  0  0  0  1  1  0  0  0  0  0  0  0  0  0  0   0   0   0
##   code          0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   0   0   0
##   computing     0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   0   0   0
##   data          0  0  0  0  1  0  1  0  0  1  0  0  0  1  0  0   0   0   0
##               Docs
## Terms          103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
##   analysis       0   0   0   0   0   1   0   0   0   0   0   0   0   0   0
##   applications   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   code           0   0   1   0   0   1   0   0   0   1   0   0   0   0   0
##   computing      0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   data           0   3   0   1   0   0   1   0   0   0   0   1   0   0   1
##               Docs
## Terms          118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
##   analysis       0   0   0   1   0   0   0   0   1   1   1   0   0   0   0
##   applications   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   code           0   1   0   0   0   0   0   0   0   0   0   1   1   0   0
##   computing      0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   data           0   1   0   1   0   1   1   0   0   0   0   0   0   0   0
##               Docs
## Terms          133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
##   analysis       0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   applications   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   code           0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
##   computing      0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   data           1   0   0   0   0   1   2   0   0   0   2   0   2   1   0
##               Docs
## Terms          148 149 150 151 152 153 154
##   analysis       0   0   0   0   0   1   0
##   applications   0   0   0   0   0   0   1
##   code           0   0   0   0   0   0   0
##   computing      0   0   0   0   0   0   0
##   data           0   2   0   1   1   0   2
  # Collapse term counts to presence/absence: every count above 1 becomes 1
  Data <- replace(Data, Data > 1, 1)

Step 2: Transpose the matrix

  # Flip the matrix so that documents become rows and terms become columns
  df <- t(x = Data)
  # Peek at the first 5 documents to confirm the orientation changed
  head(x = df, n = 5)
##     Terms
## Docs analysis applications code computing data examples introduction
##    1        0            0    0         0    1        0            0
##    2        0            0    0         0    1        0            0
##    3        0            0    0         1    0        0            0
##    4        0            0    0         1    0        0            0
##    5        0            0    0         0    1        0            0
##     Terms
## Docs mining network package parallel positions postdoctoral r research
##    1      0       0       0        0         1            0 0        0
##    2      0       0       0        0         1            0 0        1
##    3      0       0       0        1         0            0 1        0
##    4      0       0       1        1         0            0 1        0
##    5      0       0       1        0         0            0 1        0
##     Terms
## Docs series slides social time tutorial users
##    1      0      0      0    0        0     0
##    2      0      0      0    0        0     0
##    3      0      0      0    0        0     0
##    4      0      0      0    0        1     1
##    5      0      0      0    0        0     1
  # Coerce the document-by-term matrix to the "transactions" class, then draw
  # the item frequency plot for the resulting transactions
  d <- as(object = df, Class = "transactions")
  itemFrequencyPlot(x = d)

Step 3: Generate association rules

  # Mine association rules from the document-by-term matrix.
  rules <- apriori(
    df,
    parameter = list(
      # support: how frequently the items in the LHS and RHS occur together
      support = 0.01,
      # confidence: how often the rule has been found to be true
      confidence = 0.5
    )
  )
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[21 item(s), 154 transaction(s)] done [0.00s].
## sorting and recoding items ... [21 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.00s].
## writing ... [346 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
  # Summarize the mined rule set: number of rules, distribution of rule
  # lengths, and the range of each quality measure
  summary(rules)
## set of 346 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3   4   5   6 
##  26 150 124  40   6 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   3.000   3.566   4.000   6.000 
## 
## summary of quality measures:
##     support          confidence          lift            count      
##  Min.   :0.01299   Min.   :0.5000   Min.   : 1.100   Min.   : 2.00  
##  1st Qu.:0.01299   1st Qu.:0.6667   1st Qu.: 2.200   1st Qu.: 2.00  
##  Median :0.01299   Median :1.0000   Median : 5.704   Median : 2.00  
##  Mean   :0.02104   Mean   :0.8667   Mean   : 7.753   Mean   : 3.24  
##  3rd Qu.:0.01948   3rd Qu.:1.0000   3rd Qu.:12.833   3rd Qu.: 3.00  
##  Max.   :0.22078   Max.   :1.0000   Max.   :19.250   Max.   :34.00  
## 
## mining info:
##  data ntransactions support confidence
##    df           154    0.01        0.5
  # Visualize the full rule set.
  # NOTE(review): the plot method for rules presumably comes from arulesViz;
  # confirm that package is attached before this line runs.
  plot(rules)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

  # Keep only the rules whose lift exceeds 18. Lift is used here as the
  # measure of how interesting a rule is, beyond its support and confidence.
  betterrules <- rules[which(quality(rules)[["lift"]] > 18)]
  # Summarize the retained high-lift rules
  summary(object = betterrules)
## set of 56 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4  5  6 
##  2 16 24 12  2 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   4.000   3.929   4.250   6.000 
## 
## summary of quality measures:
##     support          confidence      lift           count      
##  Min.   :0.01299   Min.   :1    Min.   :19.25   Min.   :2.000  
##  1st Qu.:0.01299   1st Qu.:1    1st Qu.:19.25   1st Qu.:2.000  
##  Median :0.01299   Median :1    Median :19.25   Median :2.000  
##  Mean   :0.01693   Mean   :1    Mean   :19.25   Mean   :2.607  
##  3rd Qu.:0.01948   3rd Qu.:1    3rd Qu.:19.25   3rd Qu.:3.000  
##  Max.   :0.05195   Max.   :1    Max.   :19.25   Max.   :8.000  
## 
## mining info:
##  data ntransactions support confidence
##    df           154    0.01        0.5
  # List every high-lift rule individually with inspect(): LHS, RHS, and the
  # quality measures of each rule
  inspect(betterrules)
##      lhs                                 rhs        support    confidence
## [1]  {series}                         => {time}     0.05194805 1         
## [2]  {time}                           => {series}   0.05194805 1         
## [3]  {computing,package}              => {parallel} 0.01298701 1         
## [4]  {computing,users}                => {parallel} 0.01298701 1         
## [5]  {code,series}                    => {time}     0.01298701 1         
## [6]  {code,time}                      => {series}   0.01298701 1         
## [7]  {series,slides}                  => {time}     0.01298701 1         
## [8]  {slides,time}                    => {series}   0.01298701 1         
## [9]  {examples,series}                => {time}     0.01298701 1         
## [10] {examples,time}                  => {series}   0.01298701 1         
## [11] {series,users}                   => {time}     0.01298701 1         
## [12] {time,users}                     => {series}   0.01298701 1         
## [13] {analysis,series}                => {time}     0.02597403 1         
## [14] {analysis,time}                  => {series}   0.02597403 1         
## [15] {mining,series}                  => {time}     0.01948052 1         
## [16] {mining,time}                    => {series}   0.01948052 1         
## [17] {r,series}                       => {time}     0.03246753 1         
## [18] {r,time}                         => {series}   0.03246753 1         
## [19] {computing,package,r}            => {parallel} 0.01298701 1         
## [20] {computing,r,users}              => {parallel} 0.01298701 1         
## [21] {code,examples,series}           => {time}     0.01298701 1         
## [22] {code,examples,time}             => {series}   0.01298701 1         
## [23] {code,r,series}                  => {time}     0.01298701 1         
## [24] {code,r,time}                    => {series}   0.01298701 1         
## [25] {examples,r,series}              => {time}     0.01298701 1         
## [26] {examples,r,time}                => {series}   0.01298701 1         
## [27] {analysis,series,users}          => {time}     0.01298701 1         
## [28] {analysis,time,users}            => {series}   0.01298701 1         
## [29] {mining,series,users}            => {time}     0.01298701 1         
## [30] {mining,time,users}              => {series}   0.01298701 1         
## [31] {r,series,users}                 => {time}     0.01298701 1         
## [32] {r,time,users}                   => {series}   0.01298701 1         
## [33] {analysis,mining,series}         => {time}     0.01948052 1         
## [34] {analysis,mining,time}           => {series}   0.01948052 1         
## [35] {analysis,r,series}              => {time}     0.01948052 1         
## [36] {analysis,r,time}                => {series}   0.01948052 1         
## [37] {mining,r,series}                => {time}     0.01948052 1         
## [38] {mining,r,time}                  => {series}   0.01948052 1         
## [39] {analysis,mining,users}          => {series}   0.01298701 1         
## [40] {analysis,mining,r}              => {series}   0.01948052 1         
## [41] {analysis,mining,users}          => {time}     0.01298701 1         
## [42] {analysis,mining,r}              => {time}     0.01948052 1         
## [43] {code,examples,r,series}         => {time}     0.01298701 1         
## [44] {code,examples,r,time}           => {series}   0.01298701 1         
## [45] {analysis,mining,series,users}   => {time}     0.01298701 1         
## [46] {analysis,mining,time,users}     => {series}   0.01298701 1         
## [47] {analysis,r,series,users}        => {time}     0.01298701 1         
## [48] {analysis,r,time,users}          => {series}   0.01298701 1         
## [49] {mining,r,series,users}          => {time}     0.01298701 1         
## [50] {mining,r,time,users}            => {series}   0.01298701 1         
## [51] {analysis,mining,r,series}       => {time}     0.01948052 1         
## [52] {analysis,mining,r,time}         => {series}   0.01948052 1         
## [53] {analysis,mining,r,users}        => {series}   0.01298701 1         
## [54] {analysis,mining,r,users}        => {time}     0.01298701 1         
## [55] {analysis,mining,r,series,users} => {time}     0.01298701 1         
## [56] {analysis,mining,r,time,users}   => {series}   0.01298701 1         
##      lift  count
## [1]  19.25 8    
## [2]  19.25 8    
## [3]  19.25 2    
## [4]  19.25 2    
## [5]  19.25 2    
## [6]  19.25 2    
## [7]  19.25 2    
## [8]  19.25 2    
## [9]  19.25 2    
## [10] 19.25 2    
## [11] 19.25 2    
## [12] 19.25 2    
## [13] 19.25 4    
## [14] 19.25 4    
## [15] 19.25 3    
## [16] 19.25 3    
## [17] 19.25 5    
## [18] 19.25 5    
## [19] 19.25 2    
## [20] 19.25 2    
## [21] 19.25 2    
## [22] 19.25 2    
## [23] 19.25 2    
## [24] 19.25 2    
## [25] 19.25 2    
## [26] 19.25 2    
## [27] 19.25 2    
## [28] 19.25 2    
## [29] 19.25 2    
## [30] 19.25 2    
## [31] 19.25 2    
## [32] 19.25 2    
## [33] 19.25 3    
## [34] 19.25 3    
## [35] 19.25 3    
## [36] 19.25 3    
## [37] 19.25 3    
## [38] 19.25 3    
## [39] 19.25 2    
## [40] 19.25 3    
## [41] 19.25 2    
## [42] 19.25 3    
## [43] 19.25 2    
## [44] 19.25 2    
## [45] 19.25 2    
## [46] 19.25 2    
## [47] 19.25 2    
## [48] 19.25 2    
## [49] 19.25 2    
## [50] 19.25 2    
## [51] 19.25 3    
## [52] 19.25 3    
## [53] 19.25 2    
## [54] 19.25 2    
## [55] 19.25 2    
## [56] 19.25 2
  # Mine a second rule set whose right-hand side is restricted to "analysis".
  rules2 <- apriori(
    df,
    parameter = list(
      # support: how frequently the items in the LHS and RHS occur together
      support = 0.01,
      # confidence: how often the rule has been found to be true
      confidence = 0.5
    ),
    # every other item may appear only on the LHS; the RHS is "analysis"
    appearance = list(default = "lhs", rhs = "analysis")
  )
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[21 item(s), 154 transaction(s)] done [0.00s].
## sorting and recoding items ... [21 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.00s].
## writing ... [45 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
  # Summarize the rule set whose RHS is restricted to "analysis": number of
  # rules, rule-length distribution, and quality measures
  summary(rules2)
## set of 45 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4  5  6 
##  4 19 15  6  1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   3.000   3.578   4.000   6.000 
## 
## summary of quality measures:
##     support          confidence          lift           count       
##  Min.   :0.01299   Min.   :0.5000   Min.   :3.348   Min.   : 2.000  
##  1st Qu.:0.01299   1st Qu.:0.6667   1st Qu.:4.464   1st Qu.: 2.000  
##  Median :0.01299   Median :1.0000   Median :6.696   Median : 2.000  
##  Mean   :0.01919   Mean   :0.8520   Mean   :5.705   Mean   : 2.956  
##  3rd Qu.:0.01948   3rd Qu.:1.0000   3rd Qu.:6.696   3rd Qu.: 3.000  
##  Max.   :0.07792   Max.   :1.0000   Max.   :6.696   Max.   :12.000  
## 
## mining info:
##  data ntransactions support confidence
##    df           154    0.01        0.5
  # Visualize the rule set whose RHS is restricted to "analysis".
  # NOTE(review): the plot method for rules presumably comes from arulesViz;
  # confirm that package is attached before this line runs.
  plot(rules2)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.