# DATA CLEANING ----
#
# Walkthrough: clean the built-in `airquality` data by imputing missing values
# with outlier-free means and then dropping outlier rows.
# Objects created: `airqual` (working copy, imputed in place),
# `Updated_airqual` (outlier-free subset used only to compute the means) and
# `data_airquality` (final cleaned data).

# airquality is a dataset from the package "datasets".
# Work on a copy so the built-in dataset is left untouched.
airqual <- airquality

# Summary to check the missing values, mean and median of each variable.
summary(airqual)

# We can see there are 37 observations missing for the variable Ozone and
# 7 observations missing for Solar.R.
# List all the observations that have at least one missing value.
airqual[!complete.cases(airqual), ]

# Check for outliers with a boxplot of every variable.
boxplot(airqual)
# We can see that the variables Ozone and Wind have outliers.

# Steps to clean the data by removing outliers and treating missing values.
# a) The mean is badly affected by outliers, hence to impute missing values
#    with the mean we first have to remove the outliers.
# b) The outliers are removed and the data is stored in a new dataset called
#    Updated_airqual.
# c) The only purpose of Updated_airqual is to supply the means used to impute
#    the missing values in airqual.
# d) Impute the missing values in airqual with the respective mean from
#    Updated_airqual.
# e) Now airqual has no missing values in Ozone and Solar.R, so we can remove
#    the outliers as we did in step b.
# f) The data is ready for analysis.

# Let's look at the outliers of these two variables, starting with Ozone.
boxplot(airqual$Ozone, horizontal = TRUE)

# We see that observations above roughly 130 could be outliers, hence we will
# keep only the observations below 130.

# Similarly for Wind.
boxplot(airqual$Wind, horizontal = TRUE)

# We can see that observations above 17 are outliers, hence we will keep only
# the observations below 17.

# Therefore, remove the outliers.
Updated_airqual <- subset(airqual, Ozone < 130 & Wind < 17)

# Let's check that all the outliers are removed.
boxplot(Updated_airqual)

# Important: subset() also drops every row where the condition evaluates to NA,
# i.e. all rows with a missing Ozone value, which shrinks the dataset.
# So instead of continuing with Updated_airqual, we go back to the working
# copy (airqual) and fill in its missing values.

# Impute the missing Ozone values in airqual with the outlier-free mean.
# na.rm = TRUE is a safeguard; Ozone in Updated_airqual has no NA because
# subset() already dropped those rows.
airqual$Ozone[is.na(airqual$Ozone)] <- mean(Updated_airqual$Ozone, na.rm = TRUE)

# Check the summary to see if there are any missing values left in Ozone.
summary(airqual)

# There are no NA's for Ozone.

# Similarly, we have to do the same with the Solar.R variable.
# Solar.R was not part of the subset condition, so Updated_airqual may still
# contain NA there; hence na.rm = TRUE is required.
airqual$Solar.R[is.na(airqual$Solar.R)] <-
  mean(Updated_airqual$Solar.R, na.rm = TRUE)

# Check the summary to confirm all the missing values were replaced by the mean.
summary(airqual)

# Now we remove the outliers from the imputed airqual dataset.
# NOTE(review): the Ozone cutoff here is 70, not the 130 used above; presumably
# it was re-read from the boxplot after imputation -- confirm.
data_airquality <- subset(airqual, Ozone < 70 & Wind < 17)
boxplot(data_airquality)
nrow(data_airquality)

# Same filter, additionally dropping observations with Wind of 2 or less.
# This overwrites the previous data_airquality.
data_airquality <- subset(airqual, Ozone < 70 & Wind < 17 & Wind > 2)
boxplot(data_airquality)