Step 1: Create a function (named readStates) to read a CSV file into R

Create a function called “readStates” that reads a CSV file from a URL.

 readStates <- function(Statefile) {
   # Statefile: address of a CSV file, given as a URL string.
   # A connection to that address is created and handed to read.csv();
   # the parsed data frame is the value of the function.
   csvConnection <- url(Statefile)
   read.csv(csvConnection)
 }
 # Download the CSV from the Census Bureau URL and keep the raw table in `States`
States <- readStates(
  Statefile = "https://www2.census.gov/programs-surveys/popest/tables/2010-2011/state/totals/nst-est2011-02.csv"
)

Step 2: Clean the dataframe

 # 3. Tidy the raw table: drop the unneeded rows and columns, then name the columns.
 # Only rows 10 through 59 and columns 1 through 5 are kept; setNames() labels
 # those five columns in left-to-right order.
 # NOTE(review): the row window is a hard-coded 50 positions. If the table also
 # lists the District of Columbia among the states, 50 rows cannot hold every
 # state - verify the last kept row against the raw data.
 # NOTE(review): in the printed values further down, populationchange equals
 # base2010 - census2010, so "base2010" may really be the later estimate -
 # confirm the column meanings against the table header.
 clStates <- setNames(
   States[10:59, 1:5],
   c("stateName", "census2010", "base2010", "populationchange", "percentchange")
 )
 
 # 4 & 5
 # Strip every "." from the stateName column.
 # fixed = TRUE makes gsub() treat "." as a literal dot; as a regular
 # expression an unescaped "." would match (and delete) every character.
 clStates$stateName <- gsub(".", "", clStates$stateName, fixed = TRUE)
 # Inspect the number of rows and columns and the type of each column
 str(clStates)
## 'data.frame':    50 obs. of  5 variables:
##  $ stateName       : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ census2010      : Factor w/ 60 levels "","1,052,567",..: 32 50 41 19 28 33 23 54 45 13 ...
##  $ base2010        : Factor w/ 59 levels "","1,051,302",..: 32 50 41 19 28 33 23 57 45 13 ...
##  $ populationchange: Factor w/ 60 levels "","-1,265","-173",..: 26 11 56 24 42 53 48 54 16 30 ...
##  $ percentchange   : Factor w/ 25 levels "","-","-0.1",..: 9 21 18 12 16 20 6 14 24 18 ...
 # Convert the last four columns from text to numeric.
 # The three count columns use "," as a thousands separator, which
 # as.numeric() cannot parse, so the commas are removed first.
 clStates$census2010 <- as.numeric(gsub(",", "", clStates$census2010))
 clStates$base2010 <- as.numeric(gsub(",", "", clStates$base2010))
 clStates$populationchange <- as.numeric(gsub(",", "", clStates$populationchange))
 # percentchange has no commas, but the raw column contains a bare "-" entry
 # (it is listed among the values in the str() output above). as.numeric("-")
 # yields NA with a "NAs introduced by coercion" warning, so a lone "-" is
 # mapped to "0" before converting. The pattern is anchored, so signed values
 # such as "-0.1" are left untouched.
 # NOTE(review): assumes "-" in this table marks a change that is zero or
 # rounds to zero - confirm against the table's footnotes.
 clStates$percentchange <- as.numeric(
   sub("^[[:space:]]*-[[:space:]]*$", "0", as.character(clStates$percentchange))
 )
 # re-check the data types for each column
 str(clStates)
## 'data.frame':    50 obs. of  5 variables:
##  $ stateName       : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ census2010      : num  4779735 710231 6392013 2915921 37253956 ...
##  $ base2010        : num  4802740 722718 6482505 2937979 37691912 ...
##  $ populationchange: num  23005 12487 90492 22058 437956 ...
##  $ percentchange   : num  0.5 1.8 1.4 0.8 1.2 1.7 0.2 1 2.7 1.4 ...

Step 3: Store and Explore the dataset

 # 6. Keep the cleaned table under the name dfStates
 dfStates <- clStates

 # 7. Sanity-check the data frame: average of the census2010 column
 mean(dfStates[["census2010"]])
## [1] 6163638

Step 4: Find the state with the Highest Population

# Based on the census2010 data, what is the population of the state with the highest population?
 # 8. Find the largest value in column census2010 and store it in `highestPopulation`.
 # The outer parentheses make R print the value as well as assign it, so the
 # result is both kept for later use and shown in the output.
 (highestPopulation <- max(dfStates$census2010))
## [1] 37253956
 # which.max() gives the row position of the largest census2010 value;
 # use it to pick the matching entry of the stateName column
 dfStates$stateName[which.max(dfStates$census2010)]
## [1] "California"
 # 9. Sort the data in increasing order of census2010
 # order() returns the row positions that arrange census2010 from smallest to largest
 sortOrder <- order(dfStates[["census2010"]], decreasing = FALSE)
 # indexing the rows by that permutation produces the sorted data frame
 sortState <- dfStates[sortOrder, ]

Step 5: Explore the distribution of the states

# 10. Write a function that takes two parameters. The first is a vector and the second is a number.
 # Distribution: proportion of the elements of `vector` that are strictly
 # less than `number`.
 #   vector - values to examine
 #   number - threshold the values are compared against
 # Returns a number between 0 and 1, or NaN when there is nothing to compare
 # (empty vector, or every comparison is missing).
 # Missing values are excluded from both the count and the total. Subsetting
 # with vector[vector < number] would keep an NA for every missing comparison,
 # so NAs would be counted as if they were below the threshold.
 Distribution <- function(vector, number) {
   isBelow <- vector < number
   sum(isBelow, na.rm = TRUE) / sum(!is.na(isBelow))
 }
 
 # 12. Quick check: only one of the five values (1) is below 2, so the expected result is 0.2
 Distribution(vector = c(1, 2, 3, 4, 5), number = 2)
## [1] 0.2
 # 13. Apply the function to dfStates$census2010, using the mean of that same
 # column as the threshold: the share of rows whose census2010 is below average
 Distribution(
   vector = dfStates$census2010,
   number = mean(dfStates$census2010)
 )
## [1] 0.66