K Means Clustering

K means Clustering is one of the simplest and most commonly used unsupervised clustering algorithms around.

The general approach is as follows:
  • Choose k centroids randomly.
  • Calculate the distance from each point in the dataset to be classified to each centroid.
  • Assign each point to the nearest centroid.
  • Calculate the centroids of the resulting clusters.
  • Repeat until the centroids don't move too much.
Here is some R code which  generates a data set and implements the algorithm. Click here to see the animation.
###########################################
# R code to implement k means classification
##########################################
#  NB - to make the animation - make sure you have ImageMajick installed from http://www.imagemagick.org/
##########################################
 
# initiate libraries
library(animation)
 
# set working directory
setwd('C:/Users/RF186004/Desktop')
 
 
#########################################
# Define some functions to be used later
#########################################
 
make_animation <- function(){
  #given x, this function finds the y values to create a circular cluster
  find_y<- function(x) runif(1, -(sqrt((radius)^2-(x-cent_x)^2)+cent_y)+2*cent_y, sqrt((radius)^2-(x-cent_x)^2)+cent_y)
 
  #finds the distances from each point to each centroid
  get_distances<- function(x) sqrt((x[1] - centroids[1:k,1])^2 + (x[2] - centroids[1:k,2])^2)
 
  #finds the centroids of a the clusters
  find_centroids<- function(i) c(mean(data[current_cluster == i,1]), mean(data[current_cluster == i,2]))
 
  #finds how far the centroids have moved
  find_delta <- function(i) sqrt((new_centroids[i, 1]-centroids[i, 1])^2+(new_centroids[i, 2]-centroids[i, 2])^2)
 
  #plots the centroids on the graph
  plot_centroids <- function(i) points(new_centroids[i,1], new_centroids[i,2], pch = 16, cex=1.5, col = "red")
 
  #########################################
  # set the parameters
  #########################################
  span <- 3       # see belwo for definiion
  radius = 4      # of the circles of data to generate 
  num_in_group = 500
  k <- 4          # specify the number of clusters to identify
 
  ##########################################
  # generate the data to be clustered. It comprises four circular groups of num_in_group points, centered on (span, span), (-span, span), (-span, -span), (-span, span) with a radius of radius
  ##########################################
  #make the 1st groups of data
  cent_x<- span
  cent_y<- span
  x = runif(num_in_group, cent_x-radius, cent_x+radius)
  y = apply(as.matrix(x), 1, find_y)
  g1 <- cbind(x, y, group = rep(1, num_in_group))
 
  #make the 2nd groups of data
  cent_x<- -span
  cent_y<- span
  x = runif(num_in_group, cent_x-radius, cent_x+radius)
  y = apply(as.matrix(x), 1, find_y)
  g2 <- cbind(x, y, rep(2, num_in_group))
 
  #make the 3rd groups of data
  cent_x<- -span
  cent_y<- -span
  x = runif(num_in_group, cent_x-radius, cent_x+radius)
  y = apply(as.matrix(x), 1, find_y)
  g3 <- cbind(x, y, rep(3, num_in_group))
 
  #make the 4th groups of data
  cent_x<- span
  cent_y<- -span
  x = runif(num_in_group, cent_x-radius, cent_x+radius)
  y = apply(as.matrix(x), 1, find_y)
  g4 <- cbind(x, y, rep(4, num_in_group))
 
  data <- rbind(g1, g2, g3, g4)
 
  ##########################################
  # do the clustering
  ##########################################
  #randomly select 4 centroids
  centroids_indicies <- sample(c(1:length(data[,1])), k, replace = FALSE)
  centroids <- data[centroids_indicies,1:2]
  delta_avg <- 10
  num_interations  <- 0
 
  while(delta_avg > 0.01 && num_interations < 50){
 
    #find the distance between each point and each of the 4 centroids
    distance <- t(apply(as.matrix(data), 1, get_distances))
 
    # assign each point to a cluster
    current_cluster <- apply(as.matrix(distance), 1, which.min)
 
    # find the new centroids - a k by 2 matrix
    new_centroids <- t(apply(as.matrix(c(1:k)), 1, find_centroids))
 
    #plot the data and the centroids 
    plot(data[,1], data[,2], col = current_cluster, pch = 3, cex=0.5, xlab="x", ylab="y", main="K Means Clustering")
    apply(as.matrix(c(1:k)), 1, plot_centroids)
 
    #find how much each centroid moved
    delta <- t(apply(as.matrix(c(1:k)), 1, find_delta))
    delta_avg = mean(delta)
 
    centroids <- new_centroids
 
    num_interations <- num_interations+1
  }
}
saveMovie(make_animation(),interval = 0.01, width = 580, height = 400)
paste("number of iterations =", num_interations)
paste("Last avg delta =", delta_avg)
Created by Pretty R at inside-R.org

Comments

Popular posts from this blog

Why Study Shakespeare?

Can Data Mining Algorithms Extract Value from your Personal Data (and should you get a piece of the action?)