# Source code for geoanalytics.scoreCalculator.SilhouetteScore

# SilhouetteScore Class for Evaluating Cluster Separation Between Training and Top-k Datasets

# **Importing and Using the SilhouetteScore Class in a Python Program**
#
#             import pandas as pd
#
#             from geoanalytics.scoreCalculator import SilhouetteScore
#
#             train_df = pd.read_csv("train.csv")
#
#             topk_df = pd.read_csv("topk.csv")
#
#             scorer = SilhouetteScore(train_df, topk_df, startBandTrainDF=2, startBandTopkDF=2)
#
#             score = scorer.run()
#

__copyright__ = """
Copyright (C)  2022 Rage Uday Kiran

     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

import numpy as np
from sklearn.metrics import silhouette_score


# SilhouetteScore: silhouette-based separation between training rows and top-k rows.
class SilhouetteScore:
    """
    **About this algorithm**

    :**Description**:
        SilhouetteScore evaluates how well-separated the top-k retrieved data points are from the training dataset
        using the silhouette coefficient. The two datasets are stacked into one sample matrix, every training row is
        labelled ``1`` and every top-k row is labelled ``0``, and the silhouette coefficient of that two-group
        labelling is returned.

    :**Parameters**:
        - `TrainDF` (*pd.DataFrame*): The original training dataset.
        - `TopkDF` (*pd.DataFrame*): The retrieved top-k dataset.
        - `startBandTrainDF` (*int*): Column index from which to extract features from `TrainDF` (default: 2).
        - `startBandTopkDF` (*int*): Column index from which to extract features from `TopkDF` (default: 2).

    :**Attributes**:
        - **TrainDF** (*pd.DataFrame*) -- Feature columns sliced from the training dataset.
        - **TopkDF** (*pd.DataFrame*) -- Feature columns sliced from the top-k dataset.

    :**Raises**:
        - `ValueError`: If the two datasets do not have the same number of feature columns after slicing.

    **Execution methods**

    **Calling from a Python program**

    .. code-block:: python

            import pandas as pd

            from geoanalytics.scoreCalculator import SilhouetteScore

            train_df = pd.read_csv("train.csv")

            topk_df = pd.read_csv("topk.csv")

            scorer = SilhouetteScore(train_df, topk_df, startBandTrainDF=2, startBandTopkDF=2)

            score = scorer.run()

    **Credits**

    This implementation was created by Raashika and revised by M. Charan Teja under the guidance of Professor Rage Uday Kiran.
    """

    def __init__(self, TrainDF, TopkDF, startBandTrainDF=2, startBandTopkDF=2):
        # Keep only the feature columns; everything left of the start index
        # is dropped from each frame.
        self.TrainDF = TrainDF.iloc[:, startBandTrainDF:]
        self.TopkDF = TopkDF.iloc[:, startBandTopkDF:]

        # Both groups are stacked row-wise in run(), so their widths must agree.
        if self.TrainDF.shape[1] != self.TopkDF.shape[1]:
            raise ValueError("TrainDF and TopkDF must have the same number of columns after slicing.")

    def run(self):
        """
        Computes the silhouette score for the two combined datasets.

        Returns:
            float: Silhouette score indicating the separation between training and top-k data points.

        Raises:
            ValueError: If either dataset has no rows, or if the two datasets together
                contain fewer than three rows. The silhouette coefficient needs two
                non-empty groups and more samples than groups.
        """
        n_train = len(self.TrainDF)
        n_topk = len(self.TopkDF)

        # Fail early with a specific message instead of the generic
        # label-count error scikit-learn would raise for these inputs.
        if n_train == 0 or n_topk == 0:
            raise ValueError(
                "TrainDF and TopkDF must each contain at least one row to compute a silhouette score."
            )
        if n_train + n_topk < 3:
            raise ValueError(
                "At least three rows in total are required to compute a silhouette score."
            )

        combined = np.vstack([self.TrainDF, self.TopkDF])
        # Training rows get label 1, top-k rows get label 0, in stacking order.
        labels = np.repeat([1, 0], [n_train, n_topk])
        return silhouette_score(combined, labels)