source('SMSVM_1.2.1.R')

# Example 1:
# A toy example (p=2, k=3) illustrated in OSU TR No. 743.
# x2 is not relevant to y.

######################
#       MSVM         #
######################

# generate training data
set.seed(326)
temp <- twod.data(60)
x <- as.matrix(temp$x)
y <- temp$y
# > table(y)
# y
#  1  2  3
# 16 25 19

# MSVM with the additive spline kernel;
# tune lambda using 10-fold CV (w.r.t. the hinge loss)
msvm.out <- msvm(x, y, lambda = seq(-16, -8, 1), kernel.type = 'spline-t',
                 fold = 10, cv = T)
# Output:
# The minimum of average 10 fold cross-validated hinge loss:
# 0.6183682
# The optimal lambda on log2 scale: -12

# plot the MSVM classification boundary
kernel <- list(type = 'spline-t', par = 1)
msvm.plot(x, y, kernel, msvm.out$model)

# generate test data
set.seed(327)
temp1 <- twod.data(100)
x.test <- as.matrix(temp1$x)
y.test <- temp1$y

# prediction
fit.test <- predict.msvm(x, x.test, kernel, msvm.out$model)
# test error rate:
error.rate(y.test, fit.test)
# Output:
# > 0.48
# predicted class:
# fit.class <- apply(fit.test, 1, which.max)

######################
#       SMSVM        #
######################

#-----------------#
# One-step update #
#-----------------#
# The initial c-step is, in fact, the same as MSVM.
lambda <- seq(-16, -8, 1)
cstep.out <- cstep(x, y, lambda, kernel.type = 'spline-t', cv = T, fold = 10)
# Output:
# The minimum of 10 fold cross-validated hinge loss:
# 0.6183682
# The optimal lambda on log2 scale: -12

# c-step tuning plot
plot(lambda, cstep.out$hinge, type = "b",
     xlab = expression(log[2](lambda)), ylab = "cv.hinge")

# First update
# theta-step: feature selection --------------------------
# sequential tuning
thetastep.out <- thetastep(x, y, opt.lambda = -12, lambda.theta = seq(-10, 0, 1),
                           kernel.type = 'spline-t', fold = 10,
                           isCombined = F, cv = T)
# Output:
# The minimum of average 10 fold cross-validated hinge loss:
# 0.6095216
# The optimal lambda.theta on log2 scale: -5
# The number of selected features out of 2 : 1
# The average shrinkage factor: 0.5

# theta-step tuning plot
lambda.theta <- seq(-10, 0, 1)
plot(lambda.theta, thetastep.out$hinge, type = "b",
     xlab = expression(log[2](lambda[theta])), ylab = "cv.hinge")

# optimal rescaling parameters
thetastep.out$opt.theta
# Output:
# [1] 1 0

# visualize the theta-path as a function of lambda.theta
theta.plot(lambda.theta, thetastep.out$theta.seq)
abline(v = thetastep.out$opt.lambda.theta, lty = 2)

# The first c-step based on the fitted theta -----------------
cstep1.out <- cstep(x, y, lambda, kernel.type = 'spline-t',
                    theta = thetastep.out$opt.theta, cv = T, fold = 10)
# Output:
# The minimum of 10 fold cross-validated hinge loss:
# 0.6007785
# The optimal lambda on log2 scale: -11

# visualize the classification boundaries
msvm.plot(x, y, kernel, cstep1.out$model, thetastep.out$opt.theta)

# prediction
fit.test <- predict.smsvm(x, x.test, kernel, cstep1.out$model,
                          thetastep.out$opt.theta)
# test error rate:
error.rate(y.test, fit.test)
# Output:
# > 0.47
#---------------------------------------------- end of one-step update
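# Beyond the overall error rate, a confusion matrix shows which classes the
# one-step SMSVM confuses. A minimal sketch using only base R: it recovers
# the predicted labels from the fitted values (as in the commented-out line
# in the MSVM section above) and cross-tabulates them against y.test.
fit.class <- apply(fit.test, 1, which.max)   # predicted class = column with the largest fitted value
table(truth = y.test, predicted = fit.class) # rows: true class; columns: predicted class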
# The SMSVM one-step update can be done easily with the 'smsvm' function.
smsvm.out <- smsvm(x, y, kernel.type = "spline-t", lambda = seq(-16, -8, 1),
                   lambda.theta = seq(-10, 0, 1), cv = T, fold = 10,
                   isCombined = F)

# visualize the classification boundaries
msvm.plot(x, y, kernel, smsvm.out$model, smsvm.out$opt.theta)

# prediction
fit.test <- predict.smsvm(x, x.test, kernel, smsvm.out$model, smsvm.out$opt.theta)
# test error rate:
error.rate(y.test, fit.test)

#--------------------------------------------------------------
# Compare the previous results with the theta-step under combined tuning.
thetastep.out <- thetastep(x, y, opt.lambda = -12, lambda.theta = seq(-10, 0, 1),
                           kernel.type = 'spline-t', fold = 10, cv = T)
# Output:
# The minimum of average 10 fold cross-validated hinge loss:
# 0.6080333
# The optimal lambda.theta on log2 scale: -3
# The number of selected features out of 2 : 1
# The average shrinkage factor: 0.4947156

thetastep.out$opt.theta
# [1] 0.9894312 0.0000000

# visualize the classification boundaries
msvm.plot(x, y, kernel, thetastep.out$model, thetastep.out$opt.theta)

# prediction
fit.test <- predict.smsvm(x, x.test, kernel, thetastep.out$model,
                          thetastep.out$opt.theta)
# test error rate:
error.rate(y.test, fit.test)
# Output:
# > 0.48

# Again, this SMSVM one-step update with combined tuning can be done by 'smsvm'.
smsvm.out <- smsvm(x, y, kernel.type = "spline-t", lambda = seq(-16, -8, 1),
                   lambda.theta = seq(-10, 0, 1), cv = T, fold = 10)

#-----------------#
# More iterations #
#-----------------#
# the second iteration
thetastep2.out <- thetastep(x, y, opt.lambda = -11, lambda.theta = seq(-10, 0, 1),
                            kernel.type = 'spline-t', fold = 10, isCombined = F,
                            cv = T, pretheta = thetastep.out$opt.theta)
# Output:
# The minimum of average 10 fold cross-validated hinge loss:
# 0.6007739
# The optimal lambda.theta on log2 scale: -6
# The number of selected features out of 2 : 2
# The average shrinkage factor: 0.5000019

thetastep2.out$opt.theta
# [1] 1.000000e+00 3.737154e-06

cstep2.out <- cstep(x, y, lambda = seq(-16, -8, 1), kernel.type = 'spline-t',
                    theta = thetastep2.out$opt.theta, cv = T, fold = 10)
# Output:
# The minimum of 10 fold cross-validated hinge loss:
# 0.6007746
# The optimal lambda is -11 on log2 scale.

# Little change is made in the second iteration.
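# A minimal sketch of automating further iterations: keep alternating the
# theta-step and the c-step until the cross-validated hinge loss stops
# improving. NOTE: this assumes the list returned by cstep exposes the tuned
# lambda as 'opt.lambda', analogous to thetastep's 'opt.lambda.theta';
# check names(cstep.out) in your version before relying on it.
theta.cur <- thetastep2.out$opt.theta
cstep.cur <- cstep2.out
for (iter in 3:10) {
  ts <- thetastep(x, y, opt.lambda = cstep.cur$opt.lambda,  # assumed component
                  lambda.theta = seq(-10, 0, 1), kernel.type = 'spline-t',
                  fold = 10, isCombined = F, cv = T, pretheta = theta.cur)
  cs <- cstep(x, y, lambda = seq(-16, -8, 1), kernel.type = 'spline-t',
              theta = ts$opt.theta, cv = T, fold = 10)
  if (min(cs$hinge) >= min(cstep.cur$hinge) - 1e-6) break  # no real improvement; stop
  theta.cur <- ts$opt.theta
  cstep.cur <- cs
}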
#-----------------------------------------------------------------------
# Example 2: both x1 and x2 are relevant to y.

######################
#       MSVM         #
######################

# generate training data
set.seed(123)
temp <- twod.k4(40)
x <- as.matrix(temp$x)
y <- temp$y

# scatter plot
plot(x, type = "n", xlab = expression(x[1]), ylab = expression(x[2]),
     frame.plot = T)
for (j in 1:max(y)) {
  points(x[(y == j), ], pch = 1, col = j + 1)
}
# > table(y)
# y
#  1  2  3  4
# 10 11  9 10

# MSVM with the radial basis kernel;
# tune lambda using 10-fold CV (w.r.t. the hinge loss)
msvm.out <- msvm(x, y, lambda = seq(-10, -4, 1), kernel.type = 'rbf',
                 fold = 10, cv = T)
# Output:
# The minimum of average 10 fold cross-validated hinge loss:
# 0.2184695
# The optimal lambda on log2 scale: -7

# plot the MSVM classification boundary
kernel <- list(type = 'rbf', par = 1)
msvm.plot(x, y, kernel, msvm.out$model)

# MSVM with the two-way interaction spline kernel;
# need to transform x so that each coordinate lies between 0 and 1.
trans.x <- unit.scale(x)
msvm.out <- msvm(trans.x, y, lambda = seq(-18, -12, 1),
                 kernel.type = 'spline-t2', fold = 10, cv = T)
# Output:
# The minimum of average 10 fold cross-validated hinge loss:
# 0.1747517
# The optimal lambda on log2 scale: -16

# plot the classification boundary
kernel <- list(type = 'spline-t2', par = 1)
msvm.plot(trans.x, y, kernel, msvm.out$model)

# c-step tuning plot
lambda <- seq(-18, -12, 1)
plot(lambda, msvm.out$hinge, type = "b",
     xlab = expression(log[2](lambda)), ylab = "cv.hinge")

# theta-step: feature selection --------------------------
# sequential tuning
# (with 'spline-t2' there are 3 functional components to rescale:
#  the two main effects and their two-way interaction)
thetastep.out <- thetastep(trans.x, y, opt.lambda = -16,
                           lambda.theta = seq(-10, 0, 1),
                           kernel.type = 'spline-t2', fold = 10,
                           isCombined = F, cv = T)
# Output:
# The minimum of average 10 fold cross-validated hinge loss:
# 0.1747808
# The optimal lambda.theta on log2 scale: -6
# The number of selected features out of 3 : 3
# The average shrinkage factor: 1

# optimal rescaling parameters
thetastep.out$opt.theta
# Output:
# [1] 1 1 1
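# For completeness, the spline-t2 MSVM above can be evaluated on fresh data
# from the same distribution, mirroring Example 1. A minimal sketch: the seed
# and the test-set size are arbitrary choices here, and it assumes unit.scale
# rescales each column of the training x to [0,1] by its own min/max. The
# test inputs must be mapped with the TRAINING ranges, not rescaled on their
# own, so the rescaling is redone by hand instead of calling unit.scale on
# the test set. (Test values outside the training range would fall outside
# [0,1]; clip with pmin/pmax if needed.)
set.seed(124)
temp2 <- twod.k4(200)
rng <- apply(x, 2, range)                          # column ranges of the training x
x.test <- sweep(sweep(as.matrix(temp2$x), 2, rng[1, ], "-"), 2,
                rng[2, ] - rng[1, ], "/")          # (x - min) / (max - min)
y.test <- temp2$y
fit.test <- predict.msvm(trans.x, x.test, kernel, msvm.out$model)
# test error rate:
error.rate(y.test, fit.test)

#------------------------------------ last updated by Y. Lee, Sep.10.05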