# Source code for geoanalytics.imputation.HotDeck

# HotDeck-based missing value imputation for multidimensional data with runtime and memory tracking,
# and support for saving the imputed dataset.
#
# **Importing and Using the HotDeck Class in a Python Program**
#
#             import pandas as pd
#
#             from geoanalytics.imputation import HotDeck
#
#             df = pd.read_csv('input.csv')
#
#             imputer = HotDeck(df)
#
#             imputed_df = imputer.run()
#
#             imputer.getRuntime()
#
#             imputer.getMemoryUSS()
#
#             imputer.getMemoryRSS()
#
#             imputer.save('HotDeck.csv')
#

__copyright__ = """
Copyright (C)  2022 Rage Uday Kiran

     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

import time
import psutil
from tqdm import tqdm
import pandas as pd


# [docs]
class HotDeck:
    """
    **About this algorithm**

    :**Description**:
        Hot Deck Imputation is a simple, non-parametric method for handling missing values by
        replacing each missing entry with a randomly selected observed value from the same column.
        This method is suitable for datasets where missing values are not patterned or large in volume.

    :**Parameters**:
        - Dataset (pandas DataFrame) must be provided during object initialization.
          The first two columns are treated as coordinates and are renamed to 'x' and 'y';
          every remaining column is treated as a feature to impute.

    :**Attributes**:
        - **df** (*pd.DataFrame*) -- Copy of the input data with 'x', 'y' coordinates and features.
        - **imputedDF** (*pd.DataFrame*) -- DataFrame after filling in missing values (None until run()).
        - **startTime, endTime** (*float*) -- Wall-clock timestamps used to track execution time.
        - **memoryUSS, memoryRSS** (*float*) -- Memory usage of the process in kilobytes, sampled after imputation.

    **Execution methods**

    **Calling from a Python program**

    .. code-block:: python

            import pandas as pd

            from geoanalytics.imputation import HotDeck

            df = pd.read_csv("input.csv")

            imputer = HotDeck(df)

            imputed_df = imputer.run()

            imputer.getRuntime()
            imputer.getMemoryUSS()
            imputer.getMemoryRSS()

            imputer.save('HotDeck.csv')

    **Credits**

    This implementation was created by Raashika and revised by M.Charan Teja
    under the guidance of Professor Rage Uday Kiran.
    """

    def __init__(self, dataframe):
        """
        Stores a private copy of the input and normalises the coordinate column names.

        :param dataframe: pd.DataFrame whose first two columns are coordinates
        :raises ValueError: if the DataFrame has fewer than two columns
        """
        # Work on a copy so the caller's DataFrame is never renamed or modified.
        self.df = dataframe.copy()
        if len(self.df.columns) < 2:
            # pandas would raise a ValueError ("Length mismatch") on the rename
            # below anyway; raise the same type with an actionable message.
            raise ValueError(
                "HotDeck requires at least two columns (the 'x' and 'y' coordinates); "
                f"got {len(self.df.columns)}"
            )
        self.df.columns = ['x', 'y'] + list(self.df.columns[2:])
        self.imputedDF = None
        self.startTime = None
        self.endTime = None
        self.memoryUSS = None
        self.memoryRSS = None

    def getRuntime(self):
        """
        Prints the total runtime of the imputation algorithm.

        Prints a hint instead when run() has not completed yet.
        """
        if self.startTime is None or self.endTime is None:
            # Without this guard the subtraction below fails on None operands.
            print("Runtime is not available. Run run() first")
            return
        print("Total Execution time of proposed Algorithm:", self.endTime - self.startTime, "seconds")

    def getMemoryUSS(self):
        """
        Prints the memory usage (USS) of the process in kilobytes.
        """
        print("Memory (USS) of proposed Algorithm in KB:", self.memoryUSS)

    def getMemoryRSS(self):
        """
        Prints the memory usage (RSS) of the process in kilobytes.
        """
        print("Memory (RSS) of proposed Algorithm in KB:", self.memoryRSS)

    @staticmethod
    def _fillColumn(column):
        """
        Returns *column* with each missing entry replaced by a random observed value.

        :param column: pd.Series holding one feature
        :return: pd.Series -- the input itself when nothing can or needs to be filled,
                 otherwise a filled copy
        """
        missing = column.isnull()
        missingCount = int(missing.sum())
        observed = column[~missing]
        # Nothing to fill, or no donor values to draw from: leave the column as is.
        if missingCount == 0 or observed.empty:
            return column

        filled = column.copy()
        # One draw with replacement for all gaps at once; each gap still receives
        # an independent, uniformly chosen observed value.
        donors = observed.sample(n=missingCount, replace=True)
        # to_numpy() drops the donors' index so values are assigned positionally.
        filled.loc[missing] = donors.to_numpy()
        return filled

    def run(self):
        """
        Executes the Hot Deck Imputation algorithm by replacing missing values with randomly
        selected non-missing values from the same column.

        Columns that contain no observed value are left unchanged. The row index of the
        result is reset to 0..n-1.

        :return: imputedDF (pd.DataFrame) -- DataFrame with missing values filled
        """
        self.startTime = time.time()
        xy = self.df[['x', 'y']].reset_index(drop=True)
        data = self.df.drop(['x', 'y'], axis=1).reset_index(drop=True)

        for col in tqdm(data.columns, desc="Hot Deck Imputation"):
            data[col] = self._fillColumn(data[col])

        self.imputedDF = pd.concat([xy, data], axis=1)

        self.endTime = time.time()

        # Take one snapshot so USS and RSS describe the same moment.
        memoryInfo = psutil.Process().memory_full_info()
        self.memoryUSS = memoryInfo.uss / 1024
        self.memoryRSS = memoryInfo.rss / 1024

        return self.imputedDF

    def save(self, outputFile='HotDeck.csv'):
        """
        Saves the imputed DataFrame to a CSV file.

        Failures are reported on stdout rather than raised.

        :param outputFile: str, filename to save the imputed data (default: 'HotDeck.csv')
        """
        if self.imputedDF is None:
            print("No imputed data to save. Run run() first")
            return

        try:
            self.imputedDF.to_csv(outputFile, index=False)
            print(f"Imputed data saved to: {outputFile}")
        except Exception as e:
            # Deliberately best-effort: report the problem instead of crashing the caller.
            print(f"Failed to save imputed data: {e}")