# Two-stage k-nearest-neighbour classification of operating systems from
# per-host packet statistics.
#
# Stage 1 predicts the generic operating system (column 2, "topos") with 3-NN.
# Stage 2 predicts the specific operating system (column 1, "os") with 1-NN,
# restricted to the training rows of the generic class chosen in stage 1.
#
# Inputs : "ptrain.txt" and "ptest.txt", whitespace-separated numeric files
#          with 31 values per record; "#" starts a comment.
# Output : the stage-2 confusion matrix, written to the file "confusion2".

library(class)

# Column names of the 31 fields of each record.
names <- c(
  "os",              #  1. Operating system
  "topos",           #  2. Generic operating system
  "ipnum",           #  3. IP ID number
  "npackets",        #  4. Number of packets seen
  "meanttl",         #  5. Mean TTL
  "meantos",         #  6. Mean type of service
  "ntos",            #  7. Number distinct type of service
  "meanlog2win",     #  8. mean(log(window size,2))
  "selog2win",       #  9. sd(log(window size,2))
  "nnwin",           # 10. Number distinct window sizes
  "meandf",          # 11. mean(don't fragment flag set)
  "meanincripid",    # 12. mean(diff(ipid))
  "stdincripid",     # 13. sd(diff(ipid))
  "meanlog2seq",     # 14. mean(log(sequence number,2))
  "rangelog2seq",    # 15. range(log(sequence number,2))
  "meanlog2incrseq", # 16. mean(log(diff(sequence number),2))
  "stdlog2incrseq",  # 17. sd(log(diff(sequence number),2))
  "meanlog2sport",   # 18. mean(log(source port,2))
  "maxlog2sport",    # 19. max(log(source port,2))
  "minlog2sport",    # 20. min(log(source port,2))
  "meanincrsport",   # 21. mean(diff(source port))
  "stdincrsport",    # 22. sd(diff(source port))
  "meannops",        # 23. mean(number options)
  "modeMSS",         # 24. mode maximum segment size
  "ndiff_MSS",       # 25. number distinct maximum segment size
  "ntimepstamps",    # 26. number timestamps
  "nwscale",         # 27. number window scale
  "nsackok",         # 28. number sack ok
  "nendtcp",         # 29. number end tcp
  "meaniptotlen",    # 30. mean(total length)
  "stdiptotlen"      # 31. sd(total length)
)

# Labels for the generic OS codes 0..6 found in column 2.
osnames <- c("dos", "irix", "linux", "apple", "mac", "solaris", "windows")

# Data ------------------------------------------------------------------------

# Both files are pooled and then re-split below, so the original train/test
# partition of the files is not used.
a <- matrix(scan("ptrain.txt", comment.char = "#"),
            byrow = TRUE, ncol = length(names))
b <- matrix(scan("ptest.txt", comment.char = "#"),
            byrow = TRUE, ncol = length(names))
data <- rbind(a, b)

# Generic OS classes present in the data, and the number of records in each.
cls <- unique(data[, 2])
nums <- vapply(cls, function(cl) sum(data[, 2] == cl), numeric(1))

# Within every class, odd-numbered records go to the training set and
# even-numbered records to the testing set.
training.data <- NULL
testing.data <- NULL
for (i in seq_along(cls)) {
  # drop = FALSE keeps a matrix even when a class has a single record.
  class.rows <- data[data[, 2] == cls[i], , drop = FALSE]
  odd <- seq(1, nums[i], by = 2)
  even <- setdiff(seq_len(nums[i]), odd)
  training.data <- rbind(training.data, class.rows[odd, , drop = FALSE])
  testing.data <- rbind(testing.data, class.rows[even, , drop = FALSE])
}

# Feature columns used by the classifier: 5..24 and 26..30.
variates <- c(5:24, 26:30)
training.data2 <- training.data[, variates, drop = FALSE]
testing.data2 <- testing.data[, variates, drop = FALSE]
training.cls <- training.data[, 2]
testing.cls <- testing.data[, 2]
nam <- names[variates]

# Helpers ---------------------------------------------------------------------

# Build a confusion matrix from zero-based integer class labels.
#
# truth : vector of true labels (0, 1, 2, ...).
# class : vector of predicted labels, same length as `truth`; entries that are
#         negative are not counted.
#
# Returns a square matrix whose entry [1 + t, 1 + c] is the number of
# observations with true label t and predicted label c. Stops if the two
# vectors differ in length.
confusion.matrix <- function(truth, class) {
  n <- length(truth)
  if (n != length(class)) {
    stop("\nTruth and class must be of the same length")
  }
  if (n == 0) {
    return(matrix(0, nrow = 0, ncol = 0))
  }
  # Size by the largest label in either vector so that a predicted label
  # greater than every true label cannot index outside the matrix.
  nc <- max(truth, class) + 1
  conf <- matrix(0, ncol = nc, nrow = nc)
  for (i in seq_len(n)) {
    if (class[i] >= 0) {
      conf[1 + truth[i], 1 + class[i]] <- conf[1 + truth[i], 1 + class[i]] + 1
    }
  }
  conf
}

# Per-feature minimum and maximum of the training set, used for min-max scaling.
mn <- apply(training.data2, 2, min)
mx <- apply(training.data2, 2, max)

# Scale the columns of `x` with the training minimum and range.
# A feature that is constant in the training set has range 0; it is divided
# by 1 instead so that the result contains no NaN (knn() rejects missing
# values).
scale.features <- function(x) {
  rng <- mx - mn
  rng[rng == 0] <- 1
  t((t(x) - mn) / rng)
}

td <- scale.features(training.data2)

# Experiments -----------------------------------------------------------------

# 3-NN classification of the generic OS over all seven classes.
# Returns the confusion matrix (rows = truth, columns = prediction), labelled
# with `osnames`.
test1 <- function() {
  ttd <- scale.features(testing.data2)
  k <- knn(td, ttd, as.factor(training.cls), k = 3)
  con <- confusion.matrix(testing.cls, as.integer(as.vector(k)))
  rownames(con) <- osnames
  colnames(con) <- osnames
  con
}

# 3-NN classification of the generic OS after merging classes:
# code 6 is relabelled 0, code 4 is relabelled 3, and code 5 is relabelled 4,
# leaving five classes 0..4 labelled by `newosnames` below.
# Returns the 5-class confusion matrix (rows = truth, columns = prediction).
test2 <- function() {
  newosnames <- c("windows", "irix", "linux", "mac", "solaris")
  # The order of the three replacements matters: 4 must be emptied before
  # 5 is moved into it.
  merge.classes <- function(codes) {
    codes[codes == 6] <- 0
    codes[codes == 4] <- 3
    codes[codes == 5] <- 4
    codes
  }
  x <- merge.classes(training.cls)
  y <- merge.classes(testing.cls)
  ttd <- scale.features(testing.data2)
  k <- knn(td, ttd, as.factor(x), k = 3)
  con <- confusion.matrix(y, as.integer(as.vector(k)))
  rownames(con) <- newosnames
  colnames(con) <- newosnames
  con
}

# Confusion matrix recorded in the original file (its labels match `osnames`,
# so it appears to be the output of test1(); rows = truth, columns = predicted):
#
#          dos irix linux apple mac solaris windows
# dos        0    0     0     0   2       0      32
# irix       0   16     0     0   0       0       1
# linux      0    0    25     0   0       0       0
# apple      0    0     0     0   3       0       3
# mac        0    0     0     0  31       0       0
# solaris    0    0     0     0   0      27       0
# windows    1    0     6     1   3       0    1752

# Second stage ----------------------------------------------------------------

# Stage-1 prediction of the generic OS for every testing record. These are
# computed here because the `k` and `ttd` inside test1()/test2() are local to
# those functions and are not visible at top level.
ttd <- scale.features(testing.data2)
k <- knn(td, ttd, as.factor(training.cls), k = 3)
k.crude <- as.integer(as.vector(k))

kf <- NULL            # stage-2 predicted specific OS codes
training.clsf <- NULL
testing.clsf <- NULL  # true specific OS codes, in the same order as kf

for (i in seq_along(osnames) - 1) {
  predicted.i <- k.crude == i
  if (!any(predicted.i)) {
    next  # nothing was assigned to this generic class in stage 1
  }
  z <- ttd[predicted.i, , drop = FALSE]
  testing.clsf <- c(testing.clsf, testing.data[predicted.i, 1])
  # 1-NN among the training records whose generic class is i, labelled by
  # their specific OS code (column 1).
  in.class <- training.cls == i
  w <- td[in.class, , drop = FALSE]
  k <- knn(w, z, as.factor(training.data[in.class, 1]))
  kf <- c(kf, as.integer(as.vector(k)))
}

con2 <- confusion.matrix(testing.clsf, kf)
# write() emits values column by column, so the transpose is written with one
# matrix row per output line.
write(t(con2), file = "confusion2", ncolumns = ncol(con2))