#install.packages("RCurl")
#install.packages("RJSONIO")

library(RCurl)
## Warning: package 'RCurl' was built under R version 3.5.2
## Loading required package: bitops
library(RJSONIO)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Versions/3.5/Resources/modules/R_X11.so
##   Reason: image not found
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/
## Resources/library/tcltk/libs//tcltk.so'' had status 69
## Could not load tcltk.  Will use slower R code instead.
## Loading required package: RSQLite
#library(jsonlite)

Step 1: Load the data

 # get data from the given URL
 #dataset <- getURL("https://opendata.maryland.gov/api/views/pdvh-tf2u/rows.json?accessType=DOWNLOAD")
dataset <- "https://opendata.maryland.gov/api/views/pdvh-tf2u/rows.json?accessType=DOWNLOAD"

# covert the JSON format dataset into R objects
mydata <- fromJSON(dataset, nullValue=NA,simplify = FALSE) # with RJSONIO
#mydata <- fromJSON(dataset) # with jsonlite
# detach("package:RJSONIO", unload=TRUE) to unload a package

 # look into the data summary
summary(mydata)
##      Length Class  Mode
## meta     1  -none- list
## data 18638  -none- list

Step 2: Clean the data

 # get rid of metadata and only keep the second element in the list "mydata"
 myList <- mydata[[2]]
 # count how many elements are in "myList" and store the number into variable "numRows"
 numRows <- length(myList)
 # If we didn't have nullValue = NA, then we would need to do the following
 # Test every elements in "myList" one by one, for each element:
 # First, record the element into a temporary variable "record",
 # Second, if the original element is NULL, assign NA to "record". If not, keep the original value.
 # Third, re-assign the value of "record" to the element in "myList"
 #for(i in 1:numRows) { 
 #   record <- myList[[i]] 
 #   record[sapply(record, is.null)] <- NA 
 #   myList[[i]] <- record
 #}
 
 # Unlist "myList" and transform it to a dataframe, whose number of rows is the value of "numRows"
 
 df <- data.frame(matrix(unlist(myList), nrow=numRows, byrow=T), stringsAsFactors = FALSE)

 # remove the first 8 columns
 df <- df[, -1:-8]
 # rename the rest of columns
 colnames(df) <- c("CASE_NUMBER","BARRACK","ACC_DATE","ACC_TIME","ACC_TIME_CODE","DAY_OF_WEEK","ROAD","INTERSECT_ROAD","DIST_FROM_INTERSECT","DIST_DIRECTION","CITY_NAME","COUNTY_CODE","COUNTY_NAME","VEHICLE_COUNT","PROP_DEST","INJURY","COLLISION_WITH_1","COLLISION_WITH_2")

Step 3: Understand the data using SQL (via SQLDF)

 # how many accidents happen on SUNDAY
 # Use sql to count how many obersavations meet the criterion that DAY_OF_WEEK is "SUNDAY"
# TRIM remove leading and trailing spaces from a string
 sun_acc <- sqldf("select count(DAY_OF_WEEK) from df where TRIM(DAY_OF_WEEK) = 'SUNDAY'")
 # Print the result
 print(sun_acc)
##   count(DAY_OF_WEEK)
## 1               2373
 # how many accidents had injuries(might need to remove NAs from the data)
 # Use sql to count how many obersavations meet the criterion that INJURY value is "YES"
 inj_acc <- sqldf("select count(INJURY) from df where INJURY='YES'")
 # Print the result
 print(inj_acc)
##   count(INJURY)
## 1          6433
 # list the injuries by day
 # count the number of injuries for each day of the week
 list_inj <- sqldf("select DAY_OF_WEEK, count(CASE_NUMBER) as Totalnumber from df where INJURY='YES' GROUP BY TRIM(DAY_OF_WEEK)")
 # Print the result
 print(list_inj)
##   DAY_OF_WEEK Totalnumber
## 1   FRIDAY           1043
## 2   MONDAY            915
## 3   SATURDAY          950
## 4   SUNDAY            818
## 5   THURSDAY          968
## 6   TUESDAY           843
## 7   WEDNESDAY         896

Step 4: Understand the data using tapply

 # how many accidents happen on SUNDAY
 # replace spaces by nothing (remove all spaces) in column "DAT_OF_WEEK"
 df$DAY_OF_WEEK <- gsub(" ", "", df$DAY_OF_WEEK)
 # tapply(Summary Variable, Group Variable, Function): 
 # apply the length function on the "SUNDAY" subset of the column DAY_OF_WEEK
 tapply(df$DAY_OF_WEEK, df$DAY_OF_WEEK == "SUNDAY", length)
## FALSE  TRUE 
## 16265  2373
 # how many accidents had injuries
 # apply the length function on the "YES" subset of the column INJURY
 tapply(df$INJURY, df$INJURY == 'YES', length)
## FALSE  TRUE 
## 12204  6433
 # list the injuries by day
 # apply the length function on subset of the column INJURY broken down by the value in WEEK_OF_DAY as well as INJURY
 tapply(df$INJURY=="YES", df$DAY_OF_WEEK, sum, na.rm=TRUE)
##    FRIDAY    MONDAY  SATURDAY    SUNDAY  THURSDAY   TUESDAY WEDNESDAY 
##      1043       915       950       818       968       843       896