# Step 1: Load the Data

library(gdata)
library(readxl)
 # 1) Read the data
  # read the Excel workbook with read_excel() from the readxl package
  mydata <- read_excel(
    path = "~/Dropbox/Applied Data Science (IST 687)/Datasets/MedianZIP.xlsx"
  )
  # 2) clean up the data
  # label the four columns "zip", "Median", "Mean", and "Population"
  names(mydata) <- c("zip", "Median", "Mean", "Population")

  # drop the first row of the data frame
  # (presumably a leftover header row -- confirm against the spreadsheet)
  mydata <- mydata[-1L, ]
  
  # Convert comma-formatted number strings (e.g. "1,234") to numeric.
  #   v: vector of values; gsub() coerces it to character
  # Returns a numeric vector the same length as v. Entries that are still not
  # valid numbers once the commas are removed become NA (with a warning).
  numberize <- function(v) {
    # fixed = TRUE matches a literal comma instead of compiling a regex
    without_commas <- gsub(",", "", v, fixed = TRUE)
    # the last expression is not an assignment, so the value is returned visibly
    as.numeric(without_commas)
  }
  
  # apply numberize() to every column; lapply() always returns one list element
  # per column, so data.frame() rebuilds one column per input column even when
  # only a single row remains (sapply() would simplify that case to a vector)
  mydata <- data.frame(lapply(mydata, numberize))
  
  
  
  # 3) Load the "zipcode" package
  # install.packages("zipcode")
  library(zipcode)
  # load the package's "zipcode" data frame (city, state, latitude, and
  # longitude for U.S. ZIP codes)
  data("zipcode")
  # clean up and standardize the ZIP codes in mydata with clean.zipcodes()
  mydata[["zip"]] <- clean.zipcodes(mydata[["zip"]])

  # 4) Merge the zip code information
  # join mydata with the zipcode table on their shared "zip" column and store
  # the result in "dfNew" (merge() keeps only zips present in both tables)
  dfNew <- merge(x = mydata, y = zipcode, by = "zip")

  # 5) Remove Hawaii and Alaska
  # keep only the rows whose state is neither "AK" nor "HI"; which() also
  # drops rows where the comparison is NA
  dfNew <- dfNew[which(!(dfNew$state == "AK" | dfNew$state == "HI")), ]

# Step 2: Show the income & population per state

 # 1) Create a simpler dataframe with just the average median income and the population for each state
  
  # average the ZIP-level median income within each state with tapply()
  # and keep the result, keyed by state, in the data frame "medianIncome"
  income <- tapply(dfNew[["Median"]], dfNew[["state"]], FUN = mean)
  state <- names(income)
  medianIncome <- data.frame(state = state, income = income)

  # total population per state, stored in "statePop"
  pop <- tapply(dfNew[["Population"]], dfNew[["state"]], FUN = sum)
  state <- names(pop)
  statePop <- data.frame(state = state, pop = pop)

  # combine "medianIncome" and "statePop" on their common column "state"
  dfSimple <- merge(x = medianIncome, y = statePop, by = "state")
  
  # 2) Add the state abbreviations and the state names as new columns
  # look up the full state name for each abbreviation; state.abb/state.name
  # only cover the 50 states, so "DC" is appended to the lookup -- otherwise it
  # maps to NA and never matches "district of columbia" in the map data.
  # Abbreviations outside the lookup still yield NA.
  dfSimple$stateName <- c(state.name, "District of Columbia")[
    match(dfSimple$state, c(state.abb, "DC"))
  ]
  # convert "stateName" to lower case, the form used by map_data("state")
  dfSimple$stateName <- tolower(dfSimple$stateName)
  
    
  # 3) Show the U.S. map, representing the color with the average median income of that state
  # install.packages("ggplot2")
  library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
  # install.packages("ggmap")
  library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
  # fetch the polygon data for the "state" map that the plots below draw
  us <- map_data(map = "state")
  
  # build the income map from "dfSimple", using "stateName" as the map ID
  mapIncome <- ggplot(dfSimple, aes(map_id = stateName)) +
    # fill each state by its average median income; the column is referenced
    # by name (not dfSimple$income) so ggplot resolves it from the plot data
    geom_map(map = us, aes(fill = income)) +
    # widen the x and y limits so the whole map is drawn
    expand_limits(x = us$long, y = us$lat) +
    # use a map projection so the map is not stretched
    coord_map() +
    # add a title for the map
    ggtitle("average median Income of the U.S")
  # plot the income map
  mapIncome

  # 4) Show the U.S. map, with color representing the population of the state
  # build the population map from "dfSimple", using "stateName" as the map ID
  mapPop <- ggplot(dfSimple, aes(map_id = stateName)) +
    # fill each state by its total population; the column is referenced by
    # name (not dfSimple$pop) so ggplot resolves it from the plot data
    geom_map(map = us, aes(fill = pop)) +
    # widen the x and y limits so the whole map is drawn
    expand_limits(x = us$long, y = us$lat) +
    # use a map projection so the map is not stretched, and add a title
    coord_map() +
    ggtitle("Population of the U.S")
  # plot the map
  mapPop