In a previous post we saw the differences between K-means and K-NN. Here is step by step on how to compute K-nearest neighbors KNN algorithm.
- Determine parameter K = number of nearest neighbors
- Calculate the distance between the query-instance and all the training samples
- Sort the distance and determine nearest neighbors based on the K-th minimum distance
- Gather the category of the nearest neighbors
- Use simple majority of the category of nearest neighbors as the prediction value of the query instance
Github Repository : https://github.com/devcoons/simple-knn
[ ERROR LOADING SOURCE CODE FROM GITLAB. ] [ First Version available from cache memory. ] #define _CRT_SECURE_NO_WARNINGS /* --- --- --- --- --- --- --- --- --- --- */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <inttypes.h> #include <math.h> /* --- --- --- --- --- --- --- --- --- --- */ #define DATATYPE double /* --- --- --- --- --- --- --- --- --- --- */ struct sample { DATATYPE * dim; uint32_t group; DATATYPE tmp_distance; }; struct knn_data { uint32_t k; struct sample ** best_voters; struct sample * samples[2]; uint32_t samples_count[2]; uint32_t samples_dimensions[2]; }; /* --- --- --- --- --- --- --- --- --- --- */ void parse_string_to_sample(struct sample *, char *, uint32_t, uint8_t); void parse_file_to_samples(struct knn_data *, char *); void parse_samples_to_file(struct knn_data *, char *); void knn_algorithm(struct knn_data *); /* --- --- --- --- --- --- --- --- --- --- */ int main() { char tmp_str[256]; struct knn_data knn; printf("Set number of voters (k=):"); scanf("%d", &knn.k); knn.best_voters = (struct samples **) malloc(knn.k * sizeof(struct samples *)); printf("Provide categorized samples file (train data):"); scanf("%s", tmp_str); parse_file_to_samples(&knn, tmp_str); printf("Provide uncategorized samples file (new data):"); scanf("%s", tmp_str); parse_file_to_samples(&knn, tmp_str); printf("Perform k-nn algorithm\n"); knn_algorithm(&knn); printf("Do you want to save the output (yes/no)?"); scanf("%s", tmp_str); if (strcmp(tmp_str, "yes") == 0 || strcmp(tmp_str, "y") == 0) { printf("Where do you want to save the newly categorized data (filepath)?"); scanf("%s", tmp_str); parse_samples_to_file(&knn, tmp_str); printf("Completed.\n"); } printf("Training Samples:\n"); for (int i = 0; i < knn.samples_count[0]; i++) { printf("Sample %d -",i); for (int j = 0; j < knn.samples_dimensions[0]; j++) printf(" %f", (knn.samples[0] + i)->dim[j]); printf(" | %d\n",(knn.samples[0]+i)->group); } printf("New Categorized Samples:\n"); for (int i = 0; i < knn.samples_count[1]; i++) { printf("Sample %d -",i); for (int j = 0; j < knn.samples_dimensions[1]; j++) printf(" %f", (knn.samples[1] + i)->dim[j]); printf(" | %d\n", (knn.samples[1] + i)->group); } return 0; } /* --- --- --- --- --- --- --- --- --- --- */ void parse_string_to_sample(struct sample * sample, char * string, uint32_t max_dimensions, uint8_t has_group) { int tmp_cnt = has_group == 0 ? 0 : 1; char * tmp_ptr = strtok(string, ","); if(has_group == 0) sample->group = atoi(tmp_ptr); else sample->dim[0] = atof(tmp_ptr); while ((tmp_ptr = strtok(NULL, ",")) != NULL) sample->dim[tmp_cnt++] = atof(tmp_ptr); } /* --- --- --- --- --- --- --- --- --- --- */ void parse_file_to_samples(struct knn_data * knn, char * filepath) { int tmp_cnt; char line[256]; FILE *file_pointer; file_pointer = fopen(filepath, "r"); fgets(line, 128, file_pointer); tmp_cnt = strstr(line, "uncategorized") == NULL ? 0 : 1; fgets(line, 128, file_pointer); knn->samples_count[tmp_cnt] = atoi(line); knn->samples[tmp_cnt] = (struct sample *) malloc(knn->samples_count[tmp_cnt] * sizeof(struct sample)); fgets(line, 128, file_pointer); knn->samples_dimensions[tmp_cnt] = atoi(line); for (uint32_t i = 0; i < knn->samples_count[tmp_cnt]; i++) { (knn->samples[tmp_cnt]+i)->dim = (DATATYPE *)malloc(knn->samples_dimensions[tmp_cnt] * sizeof(DATATYPE)); fgets(line, 128, file_pointer); parse_string_to_sample(knn->samples[tmp_cnt]+i, line, knn->samples_dimensions[tmp_cnt], tmp_cnt); } } /* --- --- --- --- --- --- --- --- --- --- */ void parse_samples_to_file(struct knn_data * knn, char * filepath) { printf(":) not working yet..."); } /* --- --- --- --- --- --- --- --- --- --- */ void knn_algorithms_sort_asc_voters(struct knn_data * knn) { struct sample * tmp_smpl = NULL; for (int i = 0; i < knn->k; i++) { for (int j = 0; j < knn->k - 1; j++) { if (knn->best_voters[j]->tmp_distance > knn->best_voters[j + 1]->tmp_distance) { tmp_smpl = knn->best_voters[j]; knn->best_voters[j] = knn->best_voters[j + 1]; knn->best_voters[j + 1] = tmp_smpl; } } } } void knn_algorithm(struct knn_data * knn) { double euclidean_distance; uint32_t * most_common[2],selected_group_pos; most_common[0] = (uint32_t *)malloc(knn->k * sizeof(uint32_t)); most_common[1] = (uint32_t *)malloc(knn->k * sizeof(uint32_t)); for (int i = 0; i < knn->samples_count[1]; i++) { for (int q = 0; q < knn->k; q++) knn->best_voters[q] = NULL; for (int j = 0; j < knn->samples_count[0]; j++) { euclidean_distance = 0; for (int q = 0; q < knn->samples_dimensions[0]; q++) euclidean_distance += pow( (knn->samples[0] + j)->dim[q] - (knn->samples[1] + i)->dim[q] , 2); (knn->samples[0]+j)->tmp_distance = sqrt(euclidean_distance); if (j < knn->k) { knn->best_voters[j] = (knn->samples[0] + j); } else { if(j == knn->k) knn_algorithms_sort_asc_voters(knn); for (int q = 0; q < knn->k; q++) if (knn->best_voters[q]->tmp_distance > (knn->samples[0] + j)->tmp_distance) { for (int z = knn->k - 1; z >= q+1 ; z--) knn->best_voters[z] = knn->best_voters[z - 1]; knn->best_voters[q] = (knn->samples[0] + j); break; } } } memset(most_common[0], 0, knn->k * sizeof(uint32_t)); memset(most_common[1], 0, knn->k * sizeof(uint32_t)); for (int j = 0; j < knn->k; j++) { for(int q = 0 ; q < knn->k ; q++) if (*(most_common[0] + q) == knn->best_voters[j]->group) { *(most_common[1] + q) += 1; break; } else if (*(most_common[0] + q) == 0) { *(most_common[0] + q) = knn->best_voters[j]->group; *(most_common[1] + q) += 1; break; } } selected_group_pos = 0; for (int j = 1; j < knn->k; j++) { if (*(most_common[1] + j) == 0) break; if (*(most_common[0] + j) > *(most_common[0] + selected_group_pos)) selected_group_pos = j; } (knn->samples[1] + i)->group = *(most_common[0] + selected_group_pos); } } /* --- --- --- --- --- --- --- --- --- --- */
Training Samples
categorized 6 2 0,1,1 0,2,1 0,2,2 1,7,5 1,7,6 1,8,5
New Samples to categorize
uncategorized 5 2 9,6 1,2 3,1 6,6 3,3