# Source code for geoanalytics.imputation.MedianImputation
#MedianImputation performs missing value imputation by replacing missing feature values with the median of each corresponding column, while excluding 'x' and 'y' columns and reattaching them after imputation.
#
# **Importing this algorithm into a Python program**
#
# from geoanalytics.imputation import MedianImputation as alg
#
# import pandas as pd
#
# df = pd.read_csv('dataset.csv')
#
# obj = alg.MedianImputation(df)
#
# imputed_df = obj.run()
#
# obj.save('MedianImputation.csv')
#
# obj.getRuntime()
#
# obj.getMemoryUSS()
#
# obj.getMemoryRSS()
#
# print("Data after Median Imputation:", imputed_df)
#
__copyright__ = """
Copyright (C) 2022 Rage Uday Kiran
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import time
import psutil
from tqdm import tqdm
import pandas as pd
class MedianImputation:
    """
    **About this algorithm**

    :**Description**: MedianImputation replaces missing values in feature columns with their respective column medians. The first two columns are treated as the 'x' and 'y' spatial columns: they are excluded from imputation and reattached afterwards. Runtime and memory usage are recorded when ``run()`` finishes.

    :**Parameters**: - **dataframe** (*pandas.DataFrame*) -- *Input dataset whose first two columns are the spatial coordinates, followed by feature columns with potential missing values.*

    :**Attributes**: - **df** (*pandas.DataFrame*) -- *Internal copy of the input DataFrame with its first two columns renamed to 'x' and 'y'.*
                     - **imputedDF** (*pandas.DataFrame*) -- *Final DataFrame after applying median imputation (None until run() is called).*
                     - **startTime** (*float*) -- *Start time of the imputation.*
                     - **endTime** (*float*) -- *End time of the imputation.*
                     - **memoryUSS** (*float*) -- *Memory usage (USS in KB) of the process, measured after the run.*
                     - **memoryRSS** (*float*) -- *Memory usage (RSS in KB) of the process, measured after the run.*

    **Execution methods**

    **Calling from a Python program**

    .. code-block:: python

            from geoanalytics.imputation import MedianImputation as alg

            import pandas as pd

            df = pd.read_csv('dataset.csv')

            obj = alg.MedianImputation(df)

            imputed_df = obj.run()

            obj.save('MedianImputation.csv')

            obj.getRuntime()

            obj.getMemoryUSS()

            obj.getMemoryRSS()

            print("Data after Median Imputation:", imputed_df)

    **Credits**

    The complete program was written and revised under the supervision of Professor Rage Uday Kiran.
    """

    def __init__(self, dataframe):
        """
        Initializes the MedianImputation object with a copy of the dataframe.

        :param dataframe: Input data; its first two columns are used as the 'x' and 'y' coordinates.
        :type dataframe: pandas.DataFrame
        :raises ValueError: If the dataframe has fewer than two columns.
        """
        # Work on a copy so the caller's DataFrame is never modified.
        self.df = dataframe.copy()
        if self.df.shape[1] < 2:
            # Fail with an explicit message instead of pandas' generic
            # "Length mismatch" error raised by the column assignment below.
            raise ValueError(
                "MedianImputation requires at least two columns (the 'x' and 'y' coordinates)"
            )
        # The first two columns are renamed positionally, whatever they were called.
        self.df.columns = ['x', 'y'] + list(self.df.columns[2:])
        self.imputedDF = None
        self.startTime = None
        self.endTime = None
        self.memoryUSS = None
        self.memoryRSS = None

    def getRuntime(self):
        """
        Prints the total runtime of the imputation in seconds.

        Prints a hint instead if run() has not been executed yet.
        """
        if self.startTime is None or self.endTime is None:
            # Both timestamps are only set by run(); without this guard the
            # subtraction below would fail on None operands.
            print("No runtime recorded. Run run() first")
            return
        print("Total Execution time of proposed Algorithm:", self.endTime - self.startTime, "seconds")

    def getMemoryUSS(self):
        """
        Prints the memory usage (USS) of the process in kilobytes.
        """
        print("Memory (USS) of proposed Algorithm in KB:", self.memoryUSS)

    def getMemoryRSS(self):
        """
        Prints the memory usage (RSS) of the process in kilobytes.
        """
        print("Memory (RSS) of proposed Algorithm in KB:", self.memoryRSS)

    def run(self):
        """
        Executes median imputation on the dataset (excluding 'x' and 'y' columns),
        and returns the imputed DataFrame with the original coordinates.

        Feature columns for which no median can be computed (for example text
        columns) are passed through unchanged. A column whose values are all
        missing has no median and therefore stays missing.

        Returns:
            imputedDF : pandas.DataFrame
                The DataFrame with missing values imputed using column medians.
        """
        self.startTime = time.time()

        # Reset both indexes so the column-wise concat below aligns row by row.
        coordinates = self.df[['x', 'y']].reset_index(drop=True)
        features = self.df.drop(columns=['x', 'y']).reset_index(drop=True)

        try:
            medians = features.median()
        except TypeError:
            # pandas >= 2.0 no longer silently skips columns that cannot be
            # reduced; fall back to the numeric columns only.
            medians = features.median(numeric_only=True)

        # fillna with a Series fills each column with the value under its own label.
        self.imputedDF = pd.concat([coordinates, features.fillna(medians)], axis=1)
        self.endTime = time.time()

        # Take one snapshot so USS and RSS describe the same instant.
        memoryInfo = psutil.Process().memory_full_info()
        self.memoryUSS = memoryInfo.uss / 1024
        self.memoryRSS = memoryInfo.rss / 1024
        return self.imputedDF

    def save(self, outputFile='MedianImputation.csv'):
        """
        Saves the imputed DataFrame to a CSV file (without the index column).

        Failures are reported on stdout rather than raised.

        :param outputFile: Destination path of the CSV file.
        :type outputFile: str
        """
        if self.imputedDF is None:
            print("No imputed data to save. Run run() first")
            return
        try:
            self.imputedDF.to_csv(outputFile, index=False)
        except Exception as e:
            # Saving is best-effort by design: report the problem, do not propagate it.
            print(f"Failed to save imputed data: {e}")
        else:
            print(f"Imputed data saved to: {outputFile}")