% start from a clean session
clear;          % remove all variables from the workspace
close('all');   % close every open figure window
clc;            % wipe the command window

%% print the documentation of kmeans to the command window
help('kmeans');

%% create data
n     = 100;  % number of points in each cluster
sigma = 0.5;  % standard deviation shared by all clusters

% generate three isotropic normal clusters, one per row of `centers`
centers = [ 1  1;
           -1 -1;
           -2  2];
k = size(centers, 1);  % number of clusters, derived from the table above

X = zeros(n*k, 2);
for c = 1:k
    rows = (c-1)*n + (1:n);  % rows of X that hold cluster c
    % one randn(n,2) draw per cluster, in order, so the random stream is
    % consumed exactly as with three consecutive randn(n,2) calls
    X(rows, :) = randn(n,2)*sigma + repmat(centers(c,:), n, 1);
end

% alternative data set: a tight normal blob and a surrounding unit circle
% (uncomment to replace X and k defined above)
% X1 = randn(n,2)*0.1;  % normal
% X2 = rand(n,2)-0.5; X2 = X2./repmat(sqrt(sum(X2.^2,2)),1,2); % unit circle
% X = [X1; X2]; % combine
% k = 2;

% gcplot is not defined in this file; called with a single argument here
figure; gcplot(X);

%% perform kmeans clustering

% simplest call: cidx holds one cluster index per row of X,
% ctrs holds the cluster centers
[cidx, ctrs] = kmeans(X, k);

% variant: kmeans with 5 restarts
% [cidx, ctrs] = kmeans(X, k, 'Replicates', 5);

% variant: even more options
% [cidx, ctrs] = kmeans(X, k, 'Distance', 'sqEuclidean', ...
%                             'Start', 'uniform', ...
%                             'EmptyAction', 'singleton', ...
%                             'Replicates', 5, ...
%                             'Options', statset('Display','final'));

% display the clusters
figure;
gcplot(X, cidx);

% display both the clusters and their centers
% (centers are appended as extra points with their own label max(cidx)+1)
% figure; gcplot([X; ctrs], [cidx; repmat(max(cidx)+1, k, 1)]);

%% select number of clusters when not given

% NOTE: the result vector is deliberately NOT called `scatter`; a variable
% of that name would shadow MATLAB's built-in scatter() plotting function
% for as long as it stays in the workspace.
maxK = 2*k;                     % largest number of clusters to try
totalScatter = zeros(1, maxK);  % total scatter for j = 1..maxK clusters

% j = 1: a single cluster centered at the overall mean, so the scatter is
% the sum of squared distances of all points to mean(X,1)
totalScatter(1) = sum(sum((X - repmat(mean(X,1), size(X,1), 1)).^2, 2));

for j = 2:maxK
    % third output of kmeans: per-cluster sums of point-to-centroid
    % distances (see the kmeans documentation for the distance used)
    [~, ~, intravars] = kmeans(X, j, 'Replicates', 10);
    totalScatter(j) = sum(intravars);
end

% scatter vs. number of clusters
figure;
plot(1:maxK, totalScatter, 'o--', 'MarkerSize', 14, 'LineWidth', 2)
xlabel('number of clusters')
ylabel('scatter')
grid on

%% load digits data
clear
close all
clc

% store this mat file in the same folder as this script;
% the code below reads the variables trainImages and trainLabels from it
load('MNIST_data_train')

% number of images kept per digit; the single place to change the sample
% size, so that the data and the ground-truth labels cannot get out of sync
nPerDigit = 500;

% pick the columns of trainImages whose label matches, keep the first
% nPerDigit of them, and transpose so that each row is one image
Digits1 = trainImages(:, trainLabels==1); Digits1 = Digits1(:,1:nPerDigit)'; % images of 1
Digits2 = trainImages(:, trainLabels==2); Digits2 = Digits2(:,1:nPerDigit)'; % images of 2

X = [Digits1; Digits2]; % combine: rows 1..nPerDigit are 1s, the rest are 2s
trueLabels = [ones(nPerDigit,1); 2*ones(nPerDigit,1)]; % column vector matching the rows of X

figure; gcplot(X,trueLabels); legend('1','2') % display data

%% apply kmeans to cluster the above digits data
kmeans_labels = kmeans(X, 2, 'Replicates', 10);

figure;
gcplot(X, kmeans_labels)

% compute clustering accuracy rate
% kmeans numbers its two clusters arbitrarily, so score both the labels as
% returned and the labels with 1 and 2 swapped (3 - label), and keep the
% better match; the result is displayed on purpose (no semicolon)
accuracy = max(mean(trueLabels == kmeans_labels), mean(trueLabels == (3 - kmeans_labels)))

% same quantity via a helper defined outside this file
% (presumably returns the fraction of misclassified points - confirm)
accuracy1 = 1 - computing_percentage_of_misclassified_points(kmeans_labels, trueLabels)

