# Source code for geoanalytics.scoreCalculator.OverlapScore

# OverlapScore Class for Measuring Cluster Overlap Between Two Datasets Using KMeans Clustering

# **Importing and Using the OverlapScore Class in a Python Program**
#
#             import pandas as pd
#
#             from geoanalytics.scoreCalculator import OverlapScore
#
#             train_df = pd.read_csv("train.csv")
#
#             topk_df = pd.read_csv("topk.csv")
#
#             overlap = OverlapScore(train_df, topk_df, startBandTrainDF=2, startBandTopkDF=2)
#
#             score = overlap.run(n_clusters=3)
#

__copyright__ = """
Copyright (C)  2022 Rage Uday Kiran

     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

import numpy as np
from sklearn.cluster import KMeans

class OverlapScore:
    """
    **About this algorithm**

    :**Description**: OverlapScore quantifies the cluster overlap between two
                      datasets using KMeans clustering. Both datasets are
                      stacked and clustered together; the score is the
                      fraction of top-k samples that land in the same cluster
                      as the first training sample.

    :**Parameters**:    - `TrainDF` (*pd.DataFrame*): The original training dataset.
                        - `TopkDF` (*pd.DataFrame*): The retrieved top-k dataset.
                        - `startBandTrainDF` (*int*): Column index from which to start using features in `TrainDF` (default: 2).
                        - `startBandTopkDF` (*int*): Column index from which to start using features in `TopkDF` (default: 2).

    :**Attributes**:    - **TrainDF** -- Feature columns of the training dataset, i.e. ``TrainDF.iloc[:, startBandTrainDF:]``.
                        - **TopkDF** -- Feature columns of the top-k dataset, i.e. ``TopkDF.iloc[:, startBandTopkDF:]``.

    :**Raises**: ``ValueError`` if the two sliced datasets do not have the same
                 number of columns.

    **Execution methods**

    **Calling from a Python program**

    .. code-block:: python

            import pandas as pd

            from geoanalytics.scoreCalculator import OverlapScore

            train_df = pd.read_csv("train.csv")

            topk_df = pd.read_csv("topk.csv")

            overlap = OverlapScore(train_df, topk_df, startBandTrainDF=2, startBandTopkDF=2)

            score = overlap.run(n_clusters=3)

    **Credits**

    This implementation was created by Raashika and revised by M. Charan Teja
    under the guidance of Professor Rage Uday Kiran.
    """

    def __init__(self, TrainDF, TopkDF, startBandTrainDF=2, startBandTopkDF=2):
        # Keep only the feature columns; everything before the start index is
        # dropped from each dataset.
        self.TrainDF = TrainDF.iloc[:, startBandTrainDF:]
        self.TopkDF = TopkDF.iloc[:, startBandTopkDF:]

        # Both datasets are stacked row-wise in run(), so their feature
        # dimensions have to agree.
        if self.TrainDF.shape[1] != self.TopkDF.shape[1]:
            raise ValueError("TrainDF and TopkDF must have the same number of columns after slicing.")

    def run(self, n_clusters=2):
        """
        Computes the cluster overlap score using KMeans clustering.

        Args:
            n_clusters (int): Number of clusters to use for KMeans (default: 2).

        Returns:
            float: Proportion of top-k samples that belong to the same cluster
            as the first training sample.

        Raises:
            ValueError: If the training or the top-k dataset has no rows.
        """
        trainCount = len(self.TrainDF)
        topkCount = len(self.TopkDF)

        # Fail early with a clear message: without a first training sample
        # there is no reference cluster, and an empty top-k set would turn the
        # ratio below into 0/0 (NaN).
        if trainCount == 0:
            raise ValueError("TrainDF must contain at least one row to compute the overlap score.")
        if topkCount == 0:
            raise ValueError("TopkDF must contain at least one row to compute the overlap score.")

        # Cluster both datasets together so their labels are comparable.
        combined = np.vstack([self.TrainDF, self.TopkDF])
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        labels = kmeans.fit_predict(combined)

        # Training rows come first in the stacked matrix, top-k rows after.
        trainCluster = labels[:trainCount]
        topkCluster = labels[trainCount:]

        # The cluster of the first training sample is the reference cluster.
        matches = np.count_nonzero(topkCluster == trainCluster[0])
        return float(matches / topkCount)