K-nearest neighbours algorithm in C

In a previous post we saw the differences between K-means and K-NN. Here is a step-by-step description of how the K-nearest neighbours (K-NN) algorithm works (a minimal sketch of these steps in C follows the list):

  1. Determine the parameter K = the number of nearest neighbours
  2. Calculate the distance between the query instance and all the training samples
  3. Sort the distances and determine the nearest neighbours based on the K-th minimum distance
  4. Gather the categories of those nearest neighbours
  5. Use the simple majority of the categories of the nearest neighbours as the predicted category of the query instance
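
Before looking at the full program, here is a minimal, self-contained sketch of these five steps in C. The hard-coded training points (taken from the sample files at the end of this post), the two-class assumption, and names such as knn_classify are purely illustrative; the complete, file-driven implementation follows below.

/* Minimal K-NN sketch: 2-D points, two classes (0 and 1), K = 3. */
#include <math.h>
#include <stdio.h>

#define K       3      /* step 1: number of nearest neighbours */
#define N_TRAIN 6
#define N_DIM   2

static const double train_x[N_TRAIN][N_DIM] =
    { {1,1}, {2,1}, {2,2}, {7,5}, {7,6}, {8,5} };
static const int train_y[N_TRAIN] = { 0, 0, 0, 1, 1, 1 };

static int knn_classify(const double query[N_DIM])
{
    double dist[N_TRAIN];
    int order[N_TRAIN];

    /* step 2: Euclidean distance between the query and every training sample */
    for (int i = 0; i < N_TRAIN; i++) {
        double sum = 0.0;
        for (int d = 0; d < N_DIM; d++)
            sum += (train_x[i][d] - query[d]) * (train_x[i][d] - query[d]);
        dist[i] = sqrt(sum);
        order[i] = i;
    }

    /* step 3: sort the training indices by distance (selection sort) */
    for (int i = 0; i < N_TRAIN - 1; i++)
        for (int j = i + 1; j < N_TRAIN; j++)
            if (dist[order[j]] < dist[order[i]]) {
                int t = order[i]; order[i] = order[j]; order[j] = t;
            }

    /* steps 4-5: gather the labels of the K nearest and take the majority */
    int votes[2] = { 0, 0 };
    for (int i = 0; i < K; i++)
        votes[train_y[order[i]]]++;
    return votes[1] > votes[0] ? 1 : 0;
}

int main(void)
{
    const double query[N_DIM] = { 3, 3 };
    printf("query (3,3) -> class %d\n", knn_classify(query));
    return 0;
}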

GitHub repository: https://github.com/devcoons/simple-knn

#define _CRT_SECURE_NO_WARNINGS

/*	---	---	---	---	---	---	---	---	---	---	 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <math.h>

/*	---	---	---	---	---	---	---	---	---	---	 */

#define DATATYPE double

/*	---	---	---	---	---	---	---	---	---	---	 */

struct sample
{
	DATATYPE * dim;          /* feature vector (one value per dimension)      */
	uint32_t group;          /* class label                                   */
	DATATYPE tmp_distance;   /* scratch: distance to the current query sample */
};

struct knn_data
{
	uint32_t k;                      /* number of voters (nearest neighbours)  */
	struct sample ** best_voters;    /* the k closest training samples         */
	struct sample * samples[2];      /* [0] = training set, [1] = new samples  */
	uint32_t samples_count[2];       /* number of samples in each set          */
	uint32_t samples_dimensions[2];  /* number of dimensions in each set       */
};

/*	---	---	---	---	---	---	---	---	---	---	 */

void parse_string_to_sample(struct sample *, char *, uint32_t, uint8_t);

void parse_file_to_samples(struct knn_data *, char *);

void parse_samples_to_file(struct knn_data *, char *);

void knn_algorithm(struct knn_data *);

/*	---	---	---	---	---	---	---	---	---	---	 */

int main()
{
	char tmp_str[256];

	struct knn_data knn;

	printf("Set number of voters (k=):");
	
	scanf("%d", &knn.k);

	knn.best_voters = (struct samples **) malloc(knn.k * sizeof(struct samples *));
	
	printf("Provide categorized samples file (train data):");
	
	scanf("%s", tmp_str);

	parse_file_to_samples(&knn, tmp_str);

	printf("Provide uncategorized samples file (new data):");
	
	scanf("%s", tmp_str);

	parse_file_to_samples(&knn, tmp_str);

	printf("Perform k-nn algorithm\n");
		
	knn_algorithm(&knn);
	
	printf("Do you want to save the output (yes/no)?");
	
	scanf("%s", tmp_str);

	if (strcmp(tmp_str, "yes") == 0 || strcmp(tmp_str, "y") == 0)
	{
		printf("Where do you want to save the newly categorized data (filepath)?");

		scanf("%s", tmp_str);

		parse_samples_to_file(&knn, tmp_str);

		printf("Completed.\n");
	}

	printf("Training Samples:\n");

	for (int i = 0; i < knn.samples_count[0]; i++)
	{
		printf("Sample %d -",i);

		for (int j = 0; j < knn.samples_dimensions[0]; j++)

			printf(" %f", (knn.samples[0] + i)->dim[j]);

		printf(" | %d\n",(knn.samples[0]+i)->group);
	}

	printf("New Categorized Samples:\n");

	for (int i = 0; i < knn.samples_count[1]; i++)
	{
		printf("Sample %d -",i);

		for (int j = 0; j < knn.samples_dimensions[1]; j++)

			printf(" %f", (knn.samples[1] + i)->dim[j]);

		printf(" | %d\n", (knn.samples[1] + i)->group);
	}

	return 0;
}

/*	---	---	---	---	---	---	---	---	---	---	 */

void parse_string_to_sample(struct sample * sample, char * string, uint32_t max_dimensions,
                            uint8_t is_uncategorized)
{
	/* For a categorized line the first token is the class label, so the
	   feature values start at index 0; for an uncategorized line the first
	   token is already dim[0] and the next value goes to dim[1]. */
	int tmp_cnt = is_uncategorized == 0 ? 0 : 1;

	char * tmp_ptr = strtok(string, ",");

	if (is_uncategorized == 0)

		sample->group = atoi(tmp_ptr);

	else

		sample->dim[0] = atof(tmp_ptr);

	while ((tmp_ptr = strtok(NULL, ",")) != NULL && tmp_cnt < (int)max_dimensions)

		sample->dim[tmp_cnt++] = atof(tmp_ptr);
}

/*	---	---	---	---	---	---	---	---	---	---	 */

void parse_file_to_samples(struct knn_data * knn, char * filepath)
{
	int tmp_cnt;

	char line[256];

	FILE *file_pointer;

	file_pointer = fopen(filepath, "r");

	if (file_pointer == NULL)
	{
		printf("Could not open file: %s\n", filepath);
		exit(EXIT_FAILURE);
	}

	/* First line: "categorized" or "uncategorized" selects the slot
	   (0 = training set, 1 = new samples). */
	fgets(line, sizeof(line), file_pointer);

	tmp_cnt = strstr(line, "uncategorized") == NULL ? 0 : 1;

	/* Second line: number of samples. */
	fgets(line, sizeof(line), file_pointer);

	knn->samples_count[tmp_cnt] = atoi(line);

	knn->samples[tmp_cnt] =
        (struct sample *) malloc(knn->samples_count[tmp_cnt] * sizeof(struct sample));

	/* Third line: number of dimensions per sample. */
	fgets(line, sizeof(line), file_pointer);

	knn->samples_dimensions[tmp_cnt] = atoi(line);

	/* Remaining lines: one comma-separated sample per line. */
	for (uint32_t i = 0; i < knn->samples_count[tmp_cnt]; i++)
	{
		(knn->samples[tmp_cnt]+i)->dim =
                (DATATYPE *)malloc(knn->samples_dimensions[tmp_cnt] * sizeof(DATATYPE));

		fgets(line, sizeof(line), file_pointer);

		parse_string_to_sample(knn->samples[tmp_cnt]+i, line,
                                       knn->samples_dimensions[tmp_cnt], tmp_cnt);
	}

	fclose(file_pointer);
}

/*	---	---	---	---	---	---	---	---	---	---	 */

void parse_samples_to_file(struct knn_data * knn, char * filepath)
{
	/* Not implemented yet; see the sketch below for one possible approach. */
	printf(":) not working yet...\n");
}
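
As a possible direction for parse_samples_to_file, here is a sketch that writes the newly categorized samples back out in the same plain-text layout the parser expects (a header tag, the sample count, the dimension count, then one comma-separated line per sample with the assigned group first). The output format is an assumption, not something defined by the original program.

/* Sketch only (assumed output format, mirroring the input files). */
void parse_samples_to_file(struct knn_data * knn, char * filepath)
{
	FILE * file_pointer = fopen(filepath, "w");

	if (file_pointer == NULL)
		return;

	/* Header, sample count and dimension count, as in the input files. */
	fprintf(file_pointer, "categorized\n%u\n%u\n",
	        knn->samples_count[1], knn->samples_dimensions[1]);

	/* One line per newly categorized sample: group first, then the dimensions. */
	for (uint32_t i = 0; i < knn->samples_count[1]; i++)
	{
		fprintf(file_pointer, "%u", (knn->samples[1] + i)->group);

		for (uint32_t j = 0; j < knn->samples_dimensions[1]; j++)
			fprintf(file_pointer, ",%f", (knn->samples[1] + i)->dim[j]);

		fprintf(file_pointer, "\n");
	}

	fclose(file_pointer);
}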

/*	---	---	---	---	---	---	---	---	---	---	 */


/* Sort the current k best voters in ascending order of distance (bubble sort). */
void knn_algorithms_sort_asc_voters(struct knn_data * knn)
{
	struct sample * tmp_smpl = NULL;

	for (int i = 0; i < knn->k; i++)
	{
		for (int j = 0; j < knn->k - 1; j++)
		{
		   if (knn->best_voters[j]->tmp_distance > knn->best_voters[j + 1]->tmp_distance)
		   {
		      tmp_smpl = knn->best_voters[j];

		      knn->best_voters[j] = knn->best_voters[j + 1];

		      knn->best_voters[j + 1] = tmp_smpl;
		   }
		}
	}
}

void knn_algorithm(struct knn_data * knn)
{
	double euclidean_distance;

	/* most_common[0][q] holds a group id, most_common[1][q] holds its vote count. */
	uint32_t * most_common[2], selected_group_pos;

	most_common[0] = (uint32_t *)malloc(knn->k * sizeof(uint32_t));

	most_common[1] = (uint32_t *)malloc(knn->k * sizeof(uint32_t));
	
	for (int i = 0; i < knn->samples_count[1]; i++)
	{
		for (int q = 0; q < knn->k; q++)

			knn->best_voters[q] = NULL;

		for (int j = 0; j < knn->samples_count[0]; j++)
		{
			/* Euclidean distance between training sample j and new sample i. */
			euclidean_distance = 0;

			for (int q = 0; q < knn->samples_dimensions[0]; q++)

				euclidean_distance += pow((knn->samples[0] + j)->dim[q] - (knn->samples[1] + i)->dim[q], 2);

			(knn->samples[0] + j)->tmp_distance = sqrt(euclidean_distance);

			if (j < knn->k)
			{
				/* The first k training samples simply fill the voter list. */
				knn->best_voters[j] = (knn->samples[0] + j);
			}
			else
			{
				/* Once the list is full, sort it, then keep it sorted by
				   inserting any closer training sample at its position. */
				if (j == knn->k)

					knn_algorithms_sort_asc_voters(knn);

				for (int q = 0; q < knn->k; q++)

					if (knn->best_voters[q]->tmp_distance > (knn->samples[0] + j)->tmp_distance)
					{
						for (int z = knn->k - 1; z >= q + 1; z--)

							knn->best_voters[z] = knn->best_voters[z - 1];

						knn->best_voters[q] = (knn->samples[0] + j);

						break;
					}
			}
		}

		memset(most_common[0], 0, knn->k * sizeof(uint32_t));
		
		memset(most_common[1], 0, knn->k * sizeof(uint32_t));

		/* Count the votes of the k nearest neighbours per group. A slot is
		   considered free when its vote count is still 0, so a group id of 0
		   is handled correctly. */
		for (int j = 0; j < knn->k; j++)
		{
			for (int q = 0; q < knn->k; q++)

				if (*(most_common[1] + q) != 0 && *(most_common[0] + q) == knn->best_voters[j]->group)
				{
					*(most_common[1] + q) += 1;

					break;
				}
				else if (*(most_common[1] + q) == 0)
				{
					*(most_common[0] + q) = knn->best_voters[j]->group;

					*(most_common[1] + q) += 1;

					break;
				}
		}

		/* Pick the group with the highest vote count (simple majority). */
		selected_group_pos = 0;

		for (int j = 1; j < knn->k; j++)
		{
			if (*(most_common[1] + j) == 0)
				break;

			if (*(most_common[1] + j) > *(most_common[1] + selected_group_pos))
				selected_group_pos = j;
		}

		(knn->samples[1] + i)->group = *(most_common[0] + selected_group_pos);
	}

	free(most_common[0]);

	free(most_common[1]);
}

/*	---	---	---	---	---	---	---	---	---	---	 */
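
To build the program above (assuming GCC and that the source is saved as knn.c; both are assumptions, not stated in the original post), the math library has to be linked explicitly for sqrt and pow:

gcc -o simple-knn knn.c -lm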

Training samples file. Format: a "categorized" tag, the sample count, the number of dimensions, then one sample per line as group,dim1,dim2.

categorized
6
2
0,1,1
0,2,1
0,2,2
1,7,5
1,7,6
1,8,5

New samples file. Same layout, but tagged "uncategorized" and with no group value on each line (dim1,dim2 only).

uncategorized
5
2
9,6
1,2
3,1
6,6
3,3
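
As a quick sanity check (assuming k = 3), take the new sample 1,2. Its Euclidean distances to the training samples are sqrt((1-1)^2 + (2-1)^2) = 1.00, sqrt((2-1)^2 + (1-2)^2) ≈ 1.41 and sqrt((2-1)^2 + (2-2)^2) = 1.00 for the three group-0 samples, and roughly 6.71, 7.21 and 7.62 for the three group-1 samples. The three nearest neighbours all belong to group 0, so 1,2 is assigned group 0.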