Source code for geoanalytics.clustering.KMeansPP

# KMeans++ enhances KMeans by improving initialization for better convergence and cluster quality, with added utilities for memory tracking, runtime reporting, and Elbow Method plotting on 2D-location DataFrame inputs.
#
# **Importing and Using this KMeans++ Wrapper in a Python Program**
#
#             import pandas as pd
#
#             from geoanalytics.clustering import KMeansPP
#
#             df = pd.read_csv('data.csv')
#
#             obj = KMeansPP(df)
#
#             obj.elbowMethod()
#
#             output = obj.run(k=3)
#
#             labelsDF = output[0]
#
#             clusterCenters = output[1]
#
#             obj.save(outputFileLabels='KMeansPPLabels.csv')
#

__copyright__ = """
Copyright (C)  2022 Rage Uday Kiran

     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

import time
import psutil
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.cluster import KMeans as kmeansAlg
import pandas as pd


class KMeansPP:
    """
    **About this algorithm**

    :**Description**: KMeans++ improves K-Means clustering by using smarter centroid initialization for better
                      stability and faster convergence. Here it is applied to the feature columns of the input
                      DataFrame; the first two (location) columns are excluded from the clustering itself.

    :**Parameters**:  - **dataframe** (*pd.DataFrame*) -- A Pandas DataFrame that contains the input dataset.

                        - The first two columns must be spatial or positional features (e.g., 'x' and 'y').
                        - All other columns are treated as feature vectors for clustering.

    :**Attributes**:  - **df** (*pd.DataFrame*) -- Copy of the input dataset with the first two columns renamed
                        to 'x' and 'y'.
                      - **startTime** (*float*) -- ``time.time()`` value recorded when ``run()`` starts.
                      - **endTime** (*float*) -- ``time.time()`` value recorded when the fit in ``run()`` finishes.
                      - **memoryUSS** (*float*) -- USS memory of the process in kilobytes, measured after ``run()``.
                      - **memoryRSS** (*float*) -- RSS memory of the process in kilobytes, measured after ``run()``.
                      - **labelsDF** (*pd.DataFrame*) -- DataFrame containing 'x', 'y' and the cluster label of
                        each instance.
                      - **centers** (*np.ndarray*) -- Coordinates of the final cluster centroids after fitting.

                      All attributes except **df** are ``None`` until ``run()`` has been executed.

    **Execution methods**

    **Calling from a Python program**

    .. code-block:: python

            import pandas as pd

            from geoanalytics.clustering import KMeansPP

            df = pd.read_csv('data.csv')

            obj = KMeansPP(df)

            obj.elbowMethod()

            output = obj.run(k=3)

            labelsDF = output[0]

            clusterCenters = output[1]

            obj.save(outputFileLabels='KMeansPPLabels.csv')

    **Credits**

    The complete program was written by Raashika and revised by M.Charan Teja under the supervision of
    Professor Rage Uday Kiran.
    """

    def __init__(self, dataframe):
        """
        Constructor to initialize the KMeans++ object with the given dataframe.

        :param dataframe: Input DataFrame; its first two columns are the location columns.
        :raises ValueError: If the DataFrame has fewer than two columns.
        """
        if len(dataframe.columns) < 2:
            raise ValueError(
                "The input DataFrame must have at least two columns (the 'x' and 'y' location columns)."
            )
        # Work on a copy so that renaming the columns never touches the caller's DataFrame.
        self.df = dataframe.copy()
        self.df.columns = ['x', 'y'] + list(self.df.columns[2:])
        self.labelsDF = None
        self.centers = None
        self.startTime = None
        self.endTime = None
        self.memoryUSS = None
        self.memoryRSS = None

    def _featureData(self):
        """
        Returns the feature columns (everything after the two location columns).

        :return: DataFrame holding only the feature columns.
        :raises ValueError: If there are no feature columns or no rows to cluster.
        """
        # Positional selection: a feature column that happens to be named 'x' or 'y'
        # must not be dropped together with the location columns.
        data = self.df.iloc[:, 2:]
        if data.shape[1] == 0:
            raise ValueError("The input DataFrame has no feature columns besides 'x' and 'y'.")
        if data.shape[0] == 0:
            raise ValueError("The input DataFrame has no rows to cluster.")
        return data

    def getRuntime(self):
        """
        Prints the total runtime of the clustering algorithm.

        Prints a hint instead when no completed ``run()`` is available.
        """
        if self.startTime is None or self.endTime is None:
            print("No runtime to report. Please execute run() method first.")
            return
        print("Total Execution time of proposed Algorithm:", self.endTime - self.startTime, "seconds")

    def getMemoryUSS(self):
        """
        Prints the memory usage (USS) of the process in kilobytes.
        """
        print("Memory (USS) of proposed Algorithm in KB:", self.memoryUSS)

    def getMemoryRSS(self):
        """
        Prints the memory usage (RSS) of the process in kilobytes.
        """
        print("Memory (RSS) of proposed Algorithm in KB:", self.memoryRSS)

    def elbowMethod(self, max_k=10):
        """
        Applies the elbow method to help decide the optimal number of clusters (k).
        It plots WCSS (within-cluster sum of squares) for k in range 1 to ``max_k``.

        :param max_k: Largest number of clusters to try (default 10). It is capped at the number of rows,
                      because KMeans cannot form more clusters than there are samples.
        :raises ValueError: If ``max_k`` is smaller than 1, or the data has no feature columns or no rows.
        """
        data = self._featureData()
        upper = min(max_k, len(data))
        if upper < 1:
            raise ValueError("max_k must be at least 1.")

        wcss = []
        k_values = range(1, upper + 1)
        for k in tqdm(k_values):
            kmeans = kmeansAlg(n_clusters=k, random_state=42, n_init=10)
            kmeans.fit(data)
            wcss.append(kmeans.inertia_)

        plt.plot(k_values, wcss, marker='o', linestyle='--')
        plt.xlabel('Number of Clusters (k)')
        plt.ylabel('WCSS')
        plt.title('Elbow Method for Optimal k (Ignoring Location Columns)')
        plt.show()

    def run(self, k=4, max_iter=300, random_state=None):
        """
        Runs KMeans++ clustering on the input dataset using scikit-learn.

        :param k: Number of clusters to form.
        :param max_iter: Maximum number of iterations for a single run.
        :param random_state: Optional seed forwarded to scikit-learn's KMeans for reproducible results.
                             The default ``None`` leaves the initialization unseeded.
        :return: A DataFrame with original x, y and cluster labels, and the cluster centers.
        :raises ValueError: If the data has no feature columns or no rows.
        """
        self.startTime = time.time()
        # Reset so that a failed run never leaves a stale end time from an earlier run behind.
        self.endTime = None

        data = self._featureData().to_numpy()
        kmeans = kmeansAlg(
            n_clusters=k, max_iter=max_iter, init='k-means++', random_state=random_state
        ).fit(data)

        self.labelsDF = self.df.iloc[:, :2].assign(labels=kmeans.labels_)
        self.centers = kmeans.cluster_centers_
        self.endTime = time.time()

        # Take a single snapshot: memory_full_info() is comparatively expensive and
        # reading both values from one snapshot keeps USS and RSS consistent.
        memory = psutil.Process().memory_full_info()
        self.memoryUSS = memory.uss / 1024
        self.memoryRSS = memory.rss / 1024
        return self.labelsDF, self.centers

    def save(self, outputFileLabels='KMeansPPLabels.csv', outputFileCenters='KMeansPPCenters.csv'):
        """
        Saves the cluster labels and the cluster centers as CSV files.

        Saving is best-effort: a failure is reported on standard output instead of being raised,
        and a missing result (``run()`` not executed yet) is reported the same way.

        :param outputFileLabels: Path of the CSV file that receives the 'x', 'y', 'labels' table.
        :param outputFileCenters: Path of the CSV file that receives the cluster centers.
        """
        if self.labelsDF is not None:
            try:
                self.labelsDF.to_csv(outputFileLabels, index=False)
                print(f"Labels saved to: {outputFileLabels}")
            except Exception as e:  # deliberate best-effort: report and continue with the centers
                print(f"Failed to save labels: {e}")
        else:
            print("No labels to save. Please execute run() method first.")

        if self.centers is not None:
            try:
                pd.DataFrame(self.centers).to_csv(outputFileCenters, index=False)
                print(f"Cluster centers saved to: {outputFileCenters}")
            except Exception as e:  # deliberate best-effort: report instead of raising
                print(f"Failed to save cluster centers: {e}")
        else:
            print("No cluster centers to save. Please execute run() method first.")