# Source code for geoanalytics.clustering.KMeansPP

# KMeans++ enhances KMeans by improving initialization for better convergence and cluster quality, with added utilities for memory tracking, runtime reporting, and Elbow Method plotting on 2D-location DataFrame inputs.
#
# **Importing and Using this KMeans++ Wrapper in a Python Program**
#
#             import pandas as pd
#
#             from geoanalytics.clustering.KMeansPP import KMeansPP
#
#             df = pd.read_csv('data.csv')
#
#             obj = KMeansPP(df)
#
#             obj.elbowMethod()
#
#             output = obj.run(k=3)
#
#             labelsDF = output[0]
#
#             clusterCenters = output[1]
#
#             obj.save(outputFileLabels='KMeansPPLabels.csv')
#

__copyright__ = """
Copyright (C)  2022 Rage Uday Kiran

     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

import time
import psutil
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.cluster import KMeans as kmeansAlg
import pandas as pd



# [docs]
class KMeansPP:
    """
        **About this algorithm**

        :**Description**: KMeans++ improves K-Means clustering by using smarter centroid initialization for better stability and faster convergence. Here it is applied to the feature columns of the input data; the two leading location columns (x, y) are excluded from the clustering itself.

        :**Parameters**:    - **dataframe** (*pd.DataFrame*) -- A Pandas DataFrame that contains the input dataset.
                            - The first two columns must be spatial or positional features (e.g., 'x' and 'y').
                            - All other columns are treated as feature vectors for clustering.

        :**Attributes**:    - **df** (*pd.DataFrame*) -- Copy of the input dataset with its first two columns renamed to 'x' and 'y'.
                            - **labelsDF** (*pd.DataFrame*) -- 'x', 'y' and the cluster label ('labels') of each instance; None until run() succeeds.
                            - **centers** (*np.ndarray*) -- Cluster centroids (in feature space) of the last successful run(); None until then.
                            - **startTime** (*float*) -- time.time() value taken when the last successful run() started.
                            - **endTime** (*float*) -- time.time() value taken when the last successful run() finished clustering.
                            - **memoryUSS** (*float*) -- USS memory of the process in kilobytes, measured after run().
                            - **memoryRSS** (*float*) -- RSS memory of the process in kilobytes, measured after run().

        **Execution methods**

        **Calling from a Python program**

        .. code-block:: python

                import pandas as pd

                from geoanalytics.clustering.KMeansPP import KMeansPP

                df = pd.read_csv('data.csv')

                obj = KMeansPP(df)

                obj.elbowMethod()

                output = obj.run(k=3)

                labelsDF = output[0]

                clusterCenters = output[1]

                obj.save(outputFileLabels='KMeansPPLabels.csv')

        **Credits**

        The complete program was written by Raashika and revised by M.Charan Teja under the supervision of Professor Rage Uday Kiran.

    """

    def __init__(self, dataframe):
        """
        Constructor to initialize the KMeans++ object with the given dataframe.

        :param dataframe: Input data; the first two columns are the location, the rest are features.
        :raises ValueError: If the dataframe has fewer than two columns.
        """
        # Fail early with a readable message instead of pandas' generic
        # "Length mismatch" error raised by the column assignment below.
        if len(dataframe.columns) < 2:
            raise ValueError(
                "KMeansPP needs at least two columns (the x and y location columns); "
                f"got {len(dataframe.columns)}."
            )
        # Work on a copy so that renaming the columns never touches the caller's frame.
        self.df = dataframe.copy()
        self.df.columns = ['x', 'y'] + list(self.df.columns[2:])
        self.labelsDF = None
        self.centers = None
        self.startTime = None
        self.endTime = None
        self.memoryUSS = None
        self.memoryRSS = None

    def _features(self):
        """
        Returns the feature matrix: every column after the two location columns.

        Columns are selected by position rather than by name, so a feature
        column that happens to be called 'x' or 'y' is kept.

        :return: 2-D NumPy array of the feature columns.
        :raises ValueError: If the dataset has no feature columns.
        """
        features = self.df.iloc[:, 2:]
        if features.shape[1] == 0:
            raise ValueError("No feature columns to cluster: the dataframe only contains the x and y columns.")
        return features.to_numpy()

    def getRuntime(self):
        """
        Prints the total runtime of the clustering algorithm.

        Prints a hint instead when run() has not completed yet.
        """
        if self.startTime is None or self.endTime is None:
            print("No runtime available. Please execute run() method first.")
            return
        print("Total Execution time of proposed Algorithm:", self.endTime - self.startTime, "seconds")

    def getMemoryUSS(self):
        """
        Prints the memory usage (USS) of the process in kilobytes.

        Prints a hint instead when run() has not completed yet.
        """
        if self.memoryUSS is None:
            print("No memory (USS) measurement available. Please execute run() method first.")
            return
        print("Memory (USS) of proposed Algorithm in KB:", self.memoryUSS)

    def getMemoryRSS(self):
        """
        Prints the memory usage (RSS) of the process in kilobytes.

        Prints a hint instead when run() has not completed yet.
        """
        if self.memoryRSS is None:
            print("No memory (RSS) measurement available. Please execute run() method first.")
            return
        print("Memory (RSS) of proposed Algorithm in KB:", self.memoryRSS)

    def elbowMethod(self, max_k=10):
        """
        Applies the elbow method to help decide the optimal number of clusters (k).
        It plots WCSS (within-cluster sum of squares) for k from 1 to max_k.

        :param max_k: Largest number of clusters to try (default 10). It is capped at the
                      number of rows, because k cannot exceed the number of samples.
        :raises ValueError: If there are no feature columns, or no k value can be evaluated.
        """
        data = self._features()
        upper = min(max_k, len(data))
        if upper < 1:
            raise ValueError("elbowMethod needs at least one row of data and max_k >= 1.")

        k_values = range(1, upper + 1)
        wcss = []
        for k in tqdm(k_values):
            # Fixed seed so that repeated elbow plots of the same data are comparable.
            kmeans = kmeansAlg(n_clusters=k, init='k-means++', random_state=42, n_init=10)
            kmeans.fit(data)
            wcss.append(kmeans.inertia_)

        plt.plot(k_values, wcss, marker='o', linestyle='--')
        plt.xlabel('Number of Clusters (k)')
        plt.ylabel('WCSS')
        plt.title('Elbow Method for Optimal k (Ignoring Location Columns)')
        plt.show()

    def run(self, k = 4, max_iter=300):
        """
        Runs KMeans++ clustering on the input dataset using scikit-learn.

        :param k: Number of clusters to form.
        :param max_iter: Maximum number of iterations for a single run.
        :return: A DataFrame with original x, y and cluster labels, and the cluster centers.
        :raises ValueError: If the dataset has no feature columns.
        """
        started = time.time()
        data = self._features()
        kmeans = kmeansAlg(n_clusters=k, max_iter=max_iter, init='k-means++').fit(data)
        labelled = self.df.iloc[:, :2].assign(labels=kmeans.labels_)

        # Publish results and timing only after a successful fit, so a failed
        # run never leaves a new start time paired with an old end time.
        self.labelsDF = labelled
        self.centers = kmeans.cluster_centers_
        self.startTime = started
        self.endTime = time.time()

        # memory_full_info() is comparatively expensive; take a single snapshot
        # so USS and RSS describe the same moment.
        memory = psutil.Process().memory_full_info()
        self.memoryUSS = memory.uss / 1024
        self.memoryRSS = memory.rss / 1024

        return self.labelsDF, self.centers

    def save(self, outputFileLabels='KMeansPPLabels.csv', outputFileCenters='KMeansPPCenters.csv'):
        """
        Saves the cluster labels and the cluster centers as CSV files.

        Saving is best-effort: a failure to write one file is reported on
        standard output and does not prevent the other file from being written.

        :param outputFileLabels: Path of the CSV file that receives x, y and the cluster label.
        :param outputFileCenters: Path of the CSV file that receives the cluster centers.
        """
        if self.labelsDF is not None:
            try:
                self.labelsDF.to_csv(outputFileLabels, index=False)
                print(f"Labels saved to: {outputFileLabels}")
            except Exception as e:  # deliberate best-effort: report and carry on
                print(f"Failed to save labels: {e}")
        else:
            print("No labels to save. Please execute run() method first.")

        if self.centers is not None:
            try:
                pd.DataFrame(self.centers).to_csv(outputFileCenters, index=False)
                print(f"Cluster centers saved to: {outputFileCenters}")
            except Exception as e:  # deliberate best-effort: report and carry on
                print(f"Failed to save cluster centers: {e}")
        else:
            print("No cluster centers to save. Please execute run() method first.")