Step 1: Load the data

  # Copy the built-in airquality data set into a working data frame.
  air <- data.frame(airquality)
  # List the columns that contain at least one NA.
  names(air)[colSums(is.na(air)) > 0]
## [1] "Ozone"   "Solar.R"
  # Mean-impute "Ozone": every NA becomes the mean of the observed values.
  air$Ozone <- replace(
    air$Ozone,
    is.na(air$Ozone),
    mean(air$Ozone, na.rm = TRUE)
  )
  # Mean-impute "Solar.R" in exactly the same way.
  air$Solar.R <- replace(
    air$Solar.R,
    is.na(air$Solar.R),
    mean(air$Solar.R, na.rm = TRUE)
  )

Step 2: Create train and test data sets

 # Shuffle the row numbers of "air"; "randIndex" holds them in random order.
  # NOTE: no seed is set in this section, so the split (and every number
  # recorded in the outputs) changes from run to run.
  randIndex <- sample(seq_len(nrow(air)))
  # Row count at the 2/3 mark, rounded down to a whole number.
  cutpoint2_3 <- floor(2 * nrow(air) / 3)
  # Show the cutpoint.
  cutpoint2_3
## [1] 102
  # Training set: the rows picked by the first 2/3 of the shuffled indices.
  trainData_Corey <- air[head(randIndex, cutpoint2_3), ]
  # Test set: the rows picked by the remaining 1/3 of the shuffled indices.
  testData_Corey <- air[tail(randIndex, -cutpoint2_3), ]
  # Peek at the training set.
  head(trainData_Corey)
##        Ozone  Solar.R Wind Temp Month Day
## 79  61.00000 285.0000  6.3   84     7  18
## 17  34.00000 307.0000 12.0   66     5  17
## 128 47.00000  95.0000  7.4   87     9   5
## 76   7.00000  48.0000 14.3   80     7  15
## 27  42.12931 185.9315  8.0   57     5  27
## 72  42.12931 139.0000  8.6   82     7  11
  # Confirm the size of the test set.
  dim(testData_Corey)
## [1] 51  6

Step 3: Build a Model using KSVM & visualize the results

library(kernlab)
# 1) Fit a support vector regression of Ozone with kernlab; keep it in "svmOutput"
  svmOutput <- ksvm(
    Ozone ~ .,               # response is Ozone; "." = all other columns
    data = trainData_Corey,  # fit on the training rows only
    kernel = "rbfdot",       # Gaussian radial basis function kernel
    kpar = "automatic",      # let ksvm choose the kernel parameter (sigma)
    C = 10,                  # cost of constraint violation
    cross = 10,              # 10-fold cross-validation on the training data
    prob.model = TRUE        # also fit the Laplace width reported below
  )
  # Print the fitted model summary.
  svmOutput
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 10 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.19837454359213 
## 
## Number of Support Vectors : 86 
## 
## Objective Function Value : -166.8313 
## Training error : 0.127022 
## Cross validation error : 444.6376 
## Laplace distr. width : 40.81469
  # 2) Score the held-out rows with the fitted model.
  # NOTE(review): "votes" is a classification-oriented prediction type; the
  # recorded output shows plain numeric predictions for this regression
  # model, so confirm whether type = "response" was intended.
  svmPred <- predict(svmOutput, testData_Corey, type = "votes")
  str(svmPred)
##  num [1:51, 1] 9.62 82.44 97.1 38.42 24.59 ...
  # Pair each observed Ozone value ("test") with its prediction ("Pred").
  compTable <- data.frame(
    test = testData_Corey$Ozone,
    Pred = svmPred[, 1]
  )
  # Root mean squared error of the predictions on the test set.
  with(compTable, sqrt(mean((test - Pred)^2)))
## [1] 20.18799
  # 3) Visualise where the ksvm model's errors are large
  library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:kernlab':
## 
##     alpha
  # Absolute prediction error for every test row.
  compTable$error <- abs(compTable$test - compTable$Pred)
  # Plotting frame: the error together with temperature and wind.
  svmPlot <- data.frame(
    error = compTable$error,
    Temp = testData_Corey$Temp,
    Wind = testData_Corey$Wind
  )
  # Scatter plot of Temp (x-axis) against Wind (y-axis); both point size
  # and colour shade encode the magnitude of the error.
  ggplot(data = svmPlot, mapping = aes(Temp, Wind)) +
    geom_point(mapping = aes(size = error, colour = error))

  # 4) Repeat the fit / predict / plot workflow with e1071's svm()
  #    (the 'lm' comparison follows further down)
  #install.packages("e1071")
  library(e1071)
## Warning: package 'e1071' was built under R version 3.5.2
  # Fit e1071's svm() with its default settings: Ozone is the response and
  # all other training columns are predictors.
  svm_e <- svm(Ozone ~ ., data = trainData_Corey)
  # Print the fitted model summary.
  svm_e
## 
## Call:
## svm(formula = Ozone ~ ., data = trainData_Corey)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.2 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  85
  # Score the held-out rows with "svm_e".
  svm_ePred <- predict(svm_e, testData_Corey)
  # Pair each observed Ozone value ("test") with its prediction ("Pred").
  compTable2 <- data.frame(
    test = testData_Corey$Ozone,
    Pred = svm_ePred
  )
  # Root mean squared error of the e1071 predictions on the test set.
  with(compTable2, sqrt(mean((test - Pred)^2)))
## [1] 18.06299
  # Absolute prediction error for every test row.
  compTable2$error <- abs(compTable2$test - compTable2$Pred)
  # Plotting frame: the error together with temperature and wind.
  svm_ePlot <- data.frame(
    error = compTable2$error,
    Temp = testData_Corey$Temp,
    Wind = testData_Corey$Wind
  )
  # Scatter plot of Temp against Wind; size and colour encode the error.
  ggplot(data = svm_ePlot, mapping = aes(Temp, Wind)) +
    geom_point(mapping = aes(size = error, colour = error))

  # lm: linear-regression baseline, evaluated the same way as the SVM models
  # Fit Ozone on all other columns of the training set.
  # NOTE: the fitted object is named "lm", the same as the fitting function.
  # The call lm(...) still works because R skips non-function bindings when
  # it resolves a function call; the name is kept so existing references to
  # the object stay valid.
  lm <- lm(formula = Ozone ~ ., # use all the other variables to predict "Ozone"
           data = trainData_Corey # fit on the training rows only
           )
  # Score the held-out rows with the linear model "lm".
  predLm <- predict(lm, # the linear model fitted above
                    testData_Corey # rows to generate predictions for
  )
  # Pair each observed Ozone value with the linear model's prediction.
  compTable3 <- data.frame(testData_Corey[, 1], predLm)
  # Name the columns "test" (observed) and "Pred" (predicted).
  colnames(compTable3) <- c("test", "Pred")
  # Root mean squared error of the linear model.
  # Fix: this used to read compTable2 (the e1071 SVM table), so it reported
  # the SVM's RMSE instead of the linear model's.
  sqrt(mean((compTable3$test - compTable3$Pred)^2))
  # NOTE: the output previously recorded here ("## [1] 18.06299") was the
  # e1071 SVM value produced by that bug; re-run to get the lm RMSE.
  # Absolute prediction error for every test row.
  compTable3$error <- abs(compTable3$test - compTable3$Pred)
  # Plotting frame: the error together with temperature and wind.
  lmPlot <- data.frame(compTable3$error, testData_Corey$Temp, testData_Corey$Wind)
  colnames(lmPlot) <- c("error", "Temp", "Wind")
  # Scatter plot of Temp against Wind; size and colour encode the error.
  ggplot(lmPlot, aes(x = Temp, y = Wind)) +
    geom_point(aes(size = error, color = error))