# IRIS EXAMPLE THROUGH VARIOUS CLASSIFICATION ALGORITHMS ----
# Compare three classifiers on the iris data with caret's resampling tools.
library(caret)
dataset <- iris
head(dataset)
tail(dataset)
# Hold back 20% of the rows for validation; keep 80% for training.
idx <- createDataPartition(dataset$Species, p = 0.80, list = FALSE)
validation <- dataset[-idx, ]
dataset <- dataset[idx, ]
dim(dataset)
sapply(dataset, class)
levels(dataset$Species)
# Class frequencies and their percentage share.
percentage <- prop.table(table(dataset$Species)) * 100
cbind(freq = table(dataset$Species), percentage = percentage)
summary(dataset)
# 10-fold cross-validation; models are ranked on accuracy.
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"
# Reset the seed before each fit so all models see identical CV folds.
set.seed(7)
fit.lda <- train(Species ~ ., data = dataset, method = "lda",
                 metric = metric, trControl = control)
set.seed(7)
fit.cart <- train(Species ~ ., data = dataset, method = "rpart",
                  metric = metric, trControl = control)
set.seed(7)
fit.knn <- train(Species ~ ., data = dataset, method = "knn",
                 metric = metric, trControl = control)
# Collect the resampling results and compare the three models.
results <- resamples(list(lda = fit.lda, rpart = fit.cart, knn = fit.knn))
summary(results)
dotplot(results)
# KNN ALGORITHM ----
# k-NN on the Wisconsin breast cancer data.
# FIX: install.packages() should not run on every script execution -- keep it
# commented (as done elsewhere in this file) and load the libraries up front.
library(class)    # knn()
library(gmodels)  # CrossTable()
wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)
str(wbcd)
wbcd <- wbcd[-1]  # drop the first (id) column from the dataset
table(wbcd$diagnosis)  # category counts of the target variable
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"),
                         labels = c("Benign", "Malignant"))  # recode as labeled factor
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)  # % per category
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])
# Features are on very different scales, so min-max normalize them.
# (scale() would give z-scores instead: wbcd_z <- as.data.frame(scale(wbcd[-1])))
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))  # apply to every feature column
# Fixed split: first 469 rows train, last 100 test (sample() would randomize).
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]
# install.packages("class")   # run once interactively if missing
# install.packages("gmodels") # run once interactively if missing
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
                      cl = wbcd_train_labels, k = 21)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)
# NAIVE BAYES ----
# Toy naive Bayes example: predict buy from income and gender.
buy <- c("yes", "no", "no", "yes")
income <- c("high", "high", "medium", "low")
gender <- c("male", "female", "female", "male")
dt <- data.frame(buy, income, gender, stringsAsFactors = TRUE)
str(dt)
table(dt$buy)
# Random 3/1 train/test split (unseeded, so it varies per run).
dt_samp <- sample(4, 3)
dt_test <- dt[-dt_samp, ]
dt_train <- dt[dt_samp, ]
prop.table(table(dt_test))
prop.table(table(dt_train))
library(e1071)
# FIX: exclude the target column from the predictors; the original passed the
# whole frame (including buy), leaking the label into the model.
dt_model <- naiveBayes(dt_train[-1], dt_train$buy)
# NOTE(review): predictions are made on the training rows, as in the original;
# dt_test is built but never scored.
d_pred <- predict(dt_model, dt_train)
dt_model
d_pred
cbind(dt_train, d_pred)
library(gmodels)
CrossTable(d_pred, dt_train$buy)
# DECISION TREES ----
# C5.0 decision tree on the bank credit data.
library(C50)  # FIX: C5.0() is used below but only a commented-out install existed
credit <- read.csv("bank.csv")
str(credit)
table(credit$balance)
head(credit)
summary(credit)
table(credit$default)
set.seed(123)
# Random split: 4070 of the 4521 rows go to training.
train_sample <- sample(4521, 4070)
str(train_sample)
credit_train <- credit[train_sample, ]
credit_test <- credit[-train_sample, ]
str(credit_test)
prop.table(table(credit_train$default))
prop.table(table(credit_test$default))
# FIX: the original trained on credit_train$loan with the FULL frame (target
# included) as predictors, yet evaluated against default below. Train a
# default classifier on every column except default itself.
credit_model <- C5.0(credit_train[, names(credit_train) != "default"],
                     factor(credit_train$default))
summary(credit_model)
credit_pred <- predict(credit_model, credit_test)  # (duplicate predict removed)
library(gmodels)
CrossTable(credit_test$default, credit_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))
##################################3
# C5.0 decision tree on iris.
library(caret)
library(C50)  # FIX: C5.0() is used below but C50 is never loaded in this file
dataset <- iris
str(dataset)
set.seed(7)
# 100 training rows, 50 test rows.
train_s <- sample(150, 100)
str(train_s)
iris_tr <- iris[train_s, ]
iris_te <- iris[-train_s, ]
str(iris_te)
prop.table(table(iris_tr$Species))
prop.table(table(iris_te$Species))
# FIX: drop the Species column (5) from the predictors; the original passed the
# full frame, so the label itself was available to the tree.
iris_model <- C5.0(iris_tr[-5], iris_tr$Species)
iris_prediction <- predict(iris_model, iris_te)
summary(iris_model)
library(gmodels)
# FIX: dnn labels said "default" (copy-paste from the credit example).
CrossTable(iris_te$Species, iris_prediction,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual species', 'predicted species'))
percentage <- prop.table(table(dataset$Species)) * 100
cbind(freq = table(dataset$Species), percentage = percentage)
summary(dataset)
# LINEAR REGRESSION ----
# Multiple linear regression: predict weight from height and age.
ID <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
HEIGH <- c(5, 5.11, 5.6, 5.9, 4.8, 5.8, 5.3, 5.8, 5.5, 5.6)
AGE <- c(45, 26, 30, 34, 40, 36, 19, 28, 23, 32)
WEIGH <- c(77, 47, 55, 59, 72, 60, 40, 60, 45, 58)
d <- data.frame(ID, HEIGH, AGE, WEIGH, stringsAsFactors = FALSE)
# One unseen observation to predict (WEIGH unknown).
d_test <- data.frame(ID = 11, HEIGH = 5.5, AGE = 38, WEIGH = NA,
                     stringsAsFactors = FALSE)
str(d)
summary(d$WEIGH)
cor(d[c("HEIGH", "AGE", "WEIGH")])
pairs(d[c("HEIGH", "AGE", "WEIGH")])
# FIX: pairs.panels() lives in the psych package, which is never loaded in this
# file; keep it commented out like the other optional calls here.
# psych::pairs.panels(d[c("HEIGH", "AGE", "WEIGH")])
ins_model <- lm(WEIGH ~ HEIGH + AGE, data = d)
ins_model
predict(ins_model, d_test)
# predict(ins_model, d[10, ])
summary(WEIGH)
var(WEIGH)
sd(WEIGH)
# NEURAL NETWORK ----
library(neuralnet)  # FIX: neuralnet()/compute() are used below but the library call was commented out
concrete <- read.csv("Concrete_Data.csv")
str(concrete)
# Min-max normalize every column so the network trains on [0, 1] inputs.
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
concrete_norm <- as.data.frame(lapply(concrete, normalize))
summary(concrete_norm$strength)
summary(concrete$strength)
# Fixed split: first 773 rows train, remaining 257 test.
concrete_train <- concrete_norm[1:773, ]
concrete_test <- concrete_norm[774:1030, ]
# Baseline model with the default single hidden node.
concrete_model <- neuralnet(strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age, data = concrete_train)
plot(concrete_model)
model_results <- compute(concrete_model, concrete_test[1:8])
predicted_strength <- model_results$net.result
# Correlation between predicted and actual strength on the test set.
cor(predicted_strength, concrete_test$strength)
# Larger model: 5 hidden nodes.
concrete_model2 <- neuralnet(strength ~ cement + slag +
                               ash + water + superplastic +
                               coarseagg + fineagg + age,
                             data = concrete_train, hidden = 5)
# plot(concrete_model2)
model_results2 <- compute(concrete_model2, concrete_test[1:8])
predicted_strength2 <- model_results2$net.result
cor(predicted_strength2, concrete_test$strength)
# SUPPORT VECTOR MACHINE (KSVM) ----
# Letter recognition with kernlab's ksvm.
# install.packages("kernlab"); install.packages("caret"); install.packages("knitr")
# FIX: ksvm() and kable() are used below, but kernlab/knitr were never loaded --
# only their install.packages() calls existed, commented out.
library(kernlab)
library(knitr)
letters <- read.csv("letterdata.csv")
str(letters)
kable(head(letters), caption = "obr")
# Fixed split: first 16000 rows train, last 4000 test.
letters_train <- letters[1:16000, ]
letters_test <- letters[16001:20000, ]
# Linear (vanilla dot) kernel baseline.
letter_classifier <- ksvm(lettr ~ ., data = letters_train, kernel = "vanilladot")
letter_classifier
letter_predictions <- predict(letter_classifier, letters_test)
head(letter_predictions)
table(letter_predictions, letters_test$lettr)
agreement <- letter_predictions == letters_test$lettr
table(agreement)
prop.table(table(agreement))
# Improving performance with an RBF kernel.
letter_classifier_rbf <- ksvm(lettr ~ ., data = letters_train, kernel = "rbfdot")
letter_predictions_rbf <- predict(letter_classifier_rbf, letters_test)
agreement_rbf <- letter_predictions_rbf == letters_test$lettr
table(agreement_rbf)
prop.table(table(agreement_rbf))
# SUPPORT VECTOR MACHINE (SVM) ----
# SVM on the iris petal features with e1071.
library(e1071)
iris
plot(iris)
plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
# Random 100/50 train/test split on the petal columns only (unseeded).
s <- sample(150, 100)
col <- c("Petal.Length", "Petal.Width", "Species")
iris_train <- iris[s, col]
iris_test <- iris[-s, col]
svmfit <- svm(Species ~ ., data = iris_train, kernel = "linear",
              cost = 0.1, scale = FALSE)
print(svmfit)
plot(svmfit, iris_train[, col])
# Grid-search the cost parameter.
tuned <- tune(svm, Species ~ ., data = iris_train, kernel = "linear",
              ranges = list(cost = c(0.001, 0.01, .1, 1, 10, 100)))
summary(tuned)
# FIX: predict() takes newdata=, not data=; with data= the model silently
# predicted on its own training rows instead of the test set.
p <- predict(svmfit, newdata = iris_test[, col], type = "class")
plot(p)
# FIX: tabulate predictions against the TEST labels; the original used the
# training labels, a length mismatch (50 predictions vs 100 labels).
table(p, iris_test[, 3])
mean(p == iris_test[, 3])
# MARKET BASKET ANALYSIS ----
# Association rule mining on the Groceries transactions.
# install.packages("arules")  # FIX: run once interactively, not in the script
library(arules)
# groceries <- read.transactions("Groceries.csv",sep = ",")
# FIX: Groceries was referenced below but never loaded (the read.transactions
# call above is commented out); load the data set bundled with arules.
data(Groceries)
summary(Groceries)
inspect(Groceries[1:5])
groceries <- Groceries
itemFrequency(groceries[, 1:3])
itemFrequencyPlot(groceries, support = 0.1)
itemFrequencyPlot(groceries, topN = 20)
image(groceries[1:5])
image(sample(groceries, 100))
# Mine rules above minimum support/confidence, at least 2 items per rule.
groceryrules <- apriori(groceries,
                        parameter = list(support = 0.006,
                                         confidence = 0.25, minlen = 2))
groceryrules
inspect(groceryrules[1:3])
inspect(sort(groceryrules, by = "lift")[1:5])
berryrules <- subset(groceryrules, items %in% "berries")
sodarules <- subset(groceryrules, rhs %pin% "soda")  # rules with soda in the rhs
inspect(sodarules)
inspect(berryrules)
# K-MEANS CLUSTERING ----
# k-means and hierarchical clustering on the utilities data.
library(stats)
uti <- read.csv("UTILITIES.csv")
str(uti)
summary(uti)
pairs(uti)
plot(uti$Fuel_Cost ~ uti$Sales, data = uti)
# FIX: text() has no formula method -- the original formula call errors at run
# time; pass the x/y coordinates explicitly.
with(uti, text(Sales, Fuel_Cost, labels = Company))
# Drop the first (company name) column before clustering.
z <- uti[, -1]  # FIX: was -c(1,1), a duplicated index
str(z)
# Standardize each column: apply(X, 2, f) applies f column-wise.
means <- apply(z, 2, mean)
sdd <- apply(z, 2, sd)
nor <- scale(z, center = means, scale = sdd)
# FIX: seed the RNG so the k-means cluster assignment is reproducible.
set.seed(7)
kc <- kmeans(nor, 3)
kc$cluster
kc$centers
plot(Sales ~ Demand_growth, uti, col = kc$cluster)
legend("topright", inset = .01, title = "Cluster Colors",
       legend = unique(kc$cluster), fill = unique(kc$cluster))
# Hierarchical clustering on the same standardized data.
distance <- dist(nor)
print(distance, digits = 3)
clust <- hclust(distance)
# DPLYR ----
# Read the Global Super Store workbook and explore it with dplyr verbs.
library(readxl)
global <- read_xlsx("GLOBAL SUPER STORE 2016 SALES.xlsx")
str(global)
View(global)
library(dplyr)
names(global)
# Keep three columns, then restrict to one region/product combination.
global %>%
  select(Country, Region, Product_Name) %>%
  filter(Region == "Southern Asia" &
           Product_Name == "Advantus Clock, Erganomic")
global
str(global)
mean(global$Profit)
# -------------------------------------------------------------------------
library(hflights)  # FIX: the hflights data set is used below but was never loaded
head(hflights)
tail(hflights)
# select(): choose columns by name, position, or matching helpers.
data <- select(hflights, FlightNum, ArrTime, DepTime)
head(data)
head(select(hflights, 1:4))
head(select(hflights, 5, 8))
head(select(hflights, starts_with("Year"):ends_with("ArrTime")))
head(select(hflights, contains("Time")))
head(select(hflights, starts_with("Day"), ends_with("Time")))
# filter(): subset rows by condition.
f1 <- filter(hflights, Distance > 3000)
range(f1$Distance)
mutate(hflights, (TaxiOut - TaxiIn) > AirTime)
filter(hflights, DepTime < 500 & ArrTime > 2200)
filter(hflights, Dest == "JFK" & Cancelled == 1)
# mutate(): derived columns.
head(mutate(hflights, TaxiOut - TaxiIn))
head(mutate(hflights, ArrDelay + DepDelay))
head(mutate(hflights, avgSpeed = Distance / AirTime))
head(mutate(hflights, avgGroundTime = (ArrTime + ActualElapsedTime) / 2))
filter(hflights, UniqueCarrier == 'OO' | UniqueCarrier == 'AA' | UniqueCarrier == 'US')
group_by(hflights, AirTime)
group_by(hflights, ArrDelay + DepDelay)
# PRACTICE ----
# vectors
x <- c(1, 3, 5, 7)
# Mixing a function (sin) with numbers/strings makes c() return a list.
y <- c(1, 'a', 7.1, sin)
x
y
e <- seq(from = 1, to = 8, by = 2)  # 1 3 5 7
e
rep(1:5, times = 13)
# FIX: byrow must be a logical; the original passed the string "true".
# Note: rep(1:5, times = 4) yields 20 values for a 16-cell matrix, so the
# tail is dropped (with a warning).
matrix(rep(1:5, times = 4), nrow = 4, ncol = 4, byrow = TRUE)
# Create a vector.
apple <- c('red', 'green', "yellow")
print(apple)
# Get the class of the vector.
print(class(apple))
list1 <- list(c(2, 5, 3), 21.3, sin)
# Print the list.
print(list1)
# Create a matrix.
M <- matrix(c('a', 'a', 'b', 'c', 'b', 'a'), nrow = 2, ncol = 3, byrow = TRUE)
print(M)
# rep
rep("abhay", times = 3)
# rep and seq
rep(seq(from = 2, to = 19, by = 2), times = 3)
x <- 11:15
print(x[-3])  # every element except the third
mat <- matrix(1:9, nrow = 3, byrow = TRUE)
mat[1, 3]
# FIX: read.xls() belongs to the gdata package, which is never loaded in this
# file; use readxl (already used above for the superstore workbook) instead.
library(readxl)
data <- read_xlsx(file.choose(), col_names = TRUE)
data
dim(data)
sapply(data, class)
# readxl reads text columns as character; make clg a factor so the
# levels()/table() calls below behave as intended.
data$clg <- as.factor(data$clg)
levels(data$clg)
table(data$clg)
# Category counts of clg with their percentage share.
percentage1 <- prop.table(table(data$clg)) * 100
cbind(freq = table(data$clg), percentage = percentage1)
data
summary(data)
# Interactively pick a tab-delimited text file and explore it.
data <- read.delim(file.choose(), header = TRUE)
data
tail(data)
head(data)
dim(data)
summary(data)
# Assumes the chosen file has a numeric Age column -- verify before running.
mean(data$Age)
# To use Age directly (without data$), the frame could be attach()ed first:
# attach(data); after that a bare Age reference works. Generally discouraged.
names(data)
# The same 0/1 vector summarized as numbers vs. as a factor (per-level counts).
x <- c(0, 1, 1, 1, 1, 0)
summary(x)
x <- as.factor(x)
summary(x)
# Small student records data frame with mixed column types.
data <- data.frame(
  rollnum = c(1:4),
  name = c("a", "b", "c", "d"),
  cgpa = c(seq(from = 9.25, to = 10, by = 0.25)),
  start_date = as.Date(c("2012-01-01", "2014-11-15", "2014-05-11", "2015-03-27")),
  stringsAsFactors = FALSE
)
data
# Rows with a high CGPA that also started after 1 Jan 2015
# (the character date on the right is coerced to Date for the comparison).
keep <- data$cgpa > 9.5 & data$start_date > "2015-01-01"
data[which(keep), ]
library("MASS")
library(reshape)  # FIX: melt()/cast() come from the reshape package, which was never loaded
ships
# Long format: one row per (type, year, variable, value) combination.
molten.ships <- melt(ships, id = c("type", "year"))
print(molten.ships)
# Back to wide format, summing values within each type/year cell.
recasted.ship <- cast(molten.ships, type + year ~ variable, sum)
print(recasted.ship)
# Pie chart of city counts.
# FIX: renamed labels -> city_labels (it masked base::labels) and d ->
# pie_labels (it clobbered the regression data frame d built earlier in
# this file). Neither old name is used again afterwards.
x <- c(21, 62, 10, 53)
city_labels <- c("London", "New York", "Singapore", "Mumbai")
pie(x, city_labels, main = "city pie chart", col = rainbow(length(x)))
# Annotate each slice with its percentage share.
piepercent <- round(100 * x / sum(x), 1)
pie_labels <- paste(city_labels, piepercent, sep = " ")
pie(x, labels = pie_labels, main = "City pie chart", col = rainbow(length(x)))
legend("topright", c("London", "New York", "Singapore", "Mumbai"), cex = 0.8,
       fill = rainbow(length(x)))
# Grouped bar chart: revenue per region (matrix rows) across months (columns).
colors <- c("green", "orange", "brown")
months <- c("Mar", "Apr", "May", "Jun", "Jul")
regions <- c("East", "West", "North")
Values <- matrix(c(2, 9, 3, 11, 9, 4, 8, 7, 3, 12, 5, 2, 8, 10, 11),
                 nrow = 3, ncol = 5, byrow = TRUE)
barplot(Values, main = "total revenue", names.arg = months,
        xlab = "month", ylab = "revenue", col = colors)
# Add the legend to the chart
legend("topleft", regions, cex = 1.3, fill = colors)
# Box plot of mileage grouped by cylinder count.
boxplot(mpg ~ cyl, data = mtcars, xlab = "Number of Cylinders",
        ylab = "Miles Per Gallon", main = "Mileage Data")
# Histograms of the same weights: default axes, then fixed x/y limits.
v <- c(9, 13, 21, 8, 36, 22, 12, 41, 31, 33, 19)
hist(v, xlab = "Weight", col = "yellow", border = "blue")
hist(v, xlab = "Weight", col = "green", border = "red",
     xlim = c(0, 40), ylim = c(0, 5))
# Line chart of monthly rainfall with a second overlaid series.
v <- c(7, 12, 28, 3, 41)
plot(v, type = "o", col = "red", xlab = "Month", ylab = "Rain fall",
     main = "Rain fall chart")
t <- c(14, 7, 6, 19, 3)
lines(t, type = "o", col = "blue")
# Scatter plot of car weight against mileage.
input <- mtcars[, c('wt', 'mpg')]
plot(x = input$wt, y = input$mpg,
     xlab = "Weight",
     ylab = "Milage",
     xlim = c(2.5, 5),
     ylim = c(15, 30),
     main = "Weight vs Milage")
# One extra student record shaped like the data frame built earlier.
data1 <- data.frame(rollnum = 5, name = "e", cgpa = 10.25,
                    start_date = as.Date("2012-08-04"),
                    stringsAsFactors = FALSE)
# Armstrong number check (cube version): the number must equal the sum of the
# cubes of its digits, e.g. 370 = 3^3 + 7^3 + 0^3.
# NOTE: the general Armstrong definition raises each digit to the number of
# digits; this classroom version is only meaningful for 3-digit numbers.
# FIX: use <- instead of = and avoid masking base::sum with a local `sum`.
arm <- function(num) {
  # Accumulate the sum of cubed digits.
  digit_total <- 0
  temp <- num
  while (temp > 0) {
    digit <- temp %% 10              # last digit
    digit_total <- digit_total + (digit ^ 3)
    temp <- temp %/% 10              # drop the last digit
  }
  # Report the result; print() returns the message string invisibly.
  if (num == digit_total) {
    print(paste(num, "is an Armstrong number"))
  } else {
    print(paste(num, "is not an Armstrong number"))
  }
}
arm(370)
# Append one extra row to a small data frame with dplyr::bind_rows().
library(dplyr)
a <- data.frame(q = c(1, 2), w = c(2, 3))
b <- data.frame(q = 12, w = 45)
a <- bind_rows(a, b)
a
# Correlate mpg with the hp/drat/wt columns of mtcars.
# FIX: the original indexed columns with the mpg VALUES (mtcars[, mtcars$mpg]),
# which errors (indices exceed the 11 columns); select the mpg column itself.
x <- mtcars$mpg
y <- mtcars[4:6]  # hp, drat, wt
print(cor(x, y))
# Thanks for checking this out. Please give a thumbs up if it helped you.