k-Means

No preview image

1 collaborator

WHAT IS IT?

K-Means clustering or simple k-means is a method using vector quantization for cluster analysis. In machine learning or data mining a cluster is a group of objects sharing certain properties. The algorithm is used to partition a data set of size n into k clusters where every data point belongs to the cluster with the nearest mean. This mean or center point does not have to be in the input data set. K-Means clustering can be applied to all sorts of data as long as there exist a function for measuring the distance between to vectors. In a simple case k-means can be applied to two dimensional vectors i.e. points on a plane. The algorithm then computes k groups of points belonging to the same region. Of course this problem can not be solve easily (computationally wise) since the algorithm involve a repetitive computation of distance to all the center points for all data points.

HOW IT WORKS

In the map step the algorithm computes a list: For every point the nearest center is determine In the reduce step for every center all points belonging to it are used to compute the new coordinates of the center The center points are then updated using the new data.

HOW TO USE IT

Select how many groups you want to have by using the slider for k
Select the number of iterations
Load a data set by clicking load
Click on Init to randomly select k inital centers
Click on Run

THINGS TO NOTICE

The quality of the final solution depends on both the inital centers and the number of iterations which are performed.

THINGS TO TRY

Try to hit init several times to get better inital center points Try to perform k iterations again

EXTENDING THE MODEL

The implementation is not complete since it is only iterative and thus does not compute convergence of the kmeans algorithm.

NETLOGO FEATURES

This model demonstrates how the MapReduce extension can be used.

RELATED MODELS

Check out all other MapReduce models

Comments and Questions

Please start the discussion about this model! (You'll first need to log in.)

Click to Run Model

extensions [mapreduce]

breed [points point]
breed [centroids centroid]

globals [centers ]

to test
  ask points [die]
  ask centroids [die]
  load-points "input/points4.txt"
  init
  kmeans
end 

to load
  load-points word "input/" input
end 

to load-points [#filename]
  file-open #filename
  while[not file-at-end?][
    let line file-read-line
    ; show word "line " line
    if (length line) > 0 [
      let split-pos position " " line
      let px (substring line 0 split-pos)
      let py substring line (split-pos + 1) (length line)
      create-points 1 [
        setxy (read-from-string px) (read-from-string py)
        set shape "dot"
        set color white
      ]
    ]
  ]
  file-close
end 

to-report point-list []
  let #list []
  ; let id 0
  ask points [
   ; show xcor
   ; set id (id + 1)
   let cords list xcor ycor
   set #list (lput (list who (list cords)) #list)
  ]
  
  report #list
end 

to-report euclidean-distance [#p1x #p1y #p2x #p2y]
  report sqrt ((#p1x - #p2x)*(#p1x - #p2x) + (#p1y - #p2y)*(#p1y - #p2y))
end 

to-report vecsub [#p1 #p2]
  let p1x first #p1
  let p1y first bf #p1
  let p2x first #p2
  let p2y first bf #p2
  report list (p1x - (p1x - p2x) / 2) (p1y - (p1y - p2y) / 2)
end 

to-report newMeans [#key #acc #value]
  let val read-from-string #value
  ifelse #acc = []
  [
    ; search for the point in the list of centers
    let cents centers
    let id (read-from-string #key)
    while [(not empty? cents) and (first first cents) != id][set cents bf cents]
    ifelse not empty? cents [
      ;exract coordinates
      let centr first cents
      
      report vecsub (bf centr) val]
    ; there has been a mistake in the data. Keep accumulator
    [report #acc]
  ]
  [report vecsub #acc val]
end 

to-report nearestCenter [#key #value]
  let nearest -1
  let mindist 99999
  foreach centers [
    let id first ?
    let x first butfirst ?
    let y first butfirst butfirst ?
    let dist 0
    set dist (euclidean-distance (first #value) (first butfirst #value) x y)
    if dist < mindist [
     set mindist dist
     set nearest id
    ]
  ]
  
  report nearest
end 

to mapper [#key #value]
  ; Parse input
  let val read-from-string #value
  
  ; determine nearest center
  let nearest (nearestCenter (read-from-string #key) val)
  
  ; output
  mapreduce:emit nearest #value
end 

to centroids-to-centers
  set centers []
  ask centroids [
    set centers fput (list who pxcor pycor) centers
  ]
end 

to init
  ask centroids [die]
  ; initially guess centroids
  ask n-of k points [hatch-centroids 1[
        set shape "dot"
        set color 15 ]]
end 

to kmeans
  repeat iterations [
    ; create the list of centers
    centroids-to-centers
    
    let res mapreduce point-list
    
    ; adapt centroids
    foreach res [
      let id first ?
      let x (first last ?)
      let y (last last ?)
      ask centroid id [ setxy x y ]
    ]
    show "recentering done"
    color-partitions
  ]
end 

to color-partitions
  let cl []
  ask centroids [set cl fput who cl]
  set cl sort cl
  
  foreach point-list [
    ; show (first ?)
    ; show (first last ?)
    let nc nearestCenter (first ?) (first last ?)
    ; show "jjj"
    let cid nc
    let tcl cl
    let c 25
    while [(first tcl) != cid] [set c (c + 10) set tcl bf tcl]
    ask point (first ?) [set color c]
  ]
end 

to-report mapreduce [#input]
  reset-ticks
  
  let res mapreduce:mapreduce "mapper" "newMeans" [] #input
  
  while [mapreduce:running?] [
   every 0.5 [
       ; print mapreduce:map-progress
       ; print mapreduce:reduce-progress
       ; plot 1
       tick
     ]
  ]
  tick
  
  print "done"
  report mapreduce:result res
end

There is only one version of this model, created almost 11 years ago by Martin Dobiasch.

Attached files

File	Type	Description	Last updated
input.zip	data	Input data (sample)	almost 11 years ago, by Martin Dobiasch	Download
pointgen.py	data	Simple Script for generating Input-Data	almost 11 years ago, by Martin Dobiasch	Download

This model does not have any ancestors.

This model does not have any descendants.

NetLogo