# csv2Parquet converts the input CSV file to a data frame, which is then transformed into a Parquet file.
#
# **Importing this algorithm into a python program**
#
# from PAMI.extras.convert import csvParquet as cp
#
# obj = cp.CSV2Parquet(sampleDB.csv, output.parquet, sep)
#
# obj.convert()
#
# obj.printStats()
#
__copyright__ = """
Copyright (C) 2021 Rage Uday Kiran
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import sys
import pandas as pd
import os
import psutil
import time
[docs]
class CSV2Parquet:
"""
**About this algorithm**
:**Description**: This class is to convert CSV files into Parquet format.
:**Reference**:
:**Parameters**: - **inputFile** (*str*) -- *Path to the input CSV file.*
- **outputFile** (*str*) -- *Path to the output Parquet file.*
- **sep** (*str*) -- *This variable is used to distinguish items from one another. The default seperator is tab space. However, the users can override their default separator.*
:**Attributes**: - **getMemoryUSS** (*float*) -- *Returns the memory used by the process in USS.*
- **getMemoryRSS** (*float*) -- *Returns the memory used by the process in RSS.*
- **getRuntime()** (*float*) -- *Returns the time taken to execute the conversion.*
- **printStats()** -- *Prints statistics about memory usage and runtime.*
:**Methods**: - **convert()** -- *Reads the input file, converts it to a Parquet file, and tracks memory usage and runtime.*
**Execution methods**
**Terminal command**
.. code-block:: console
Format:
(.venv) $ python3 CSV2Parquet.py <inputFile> <outputFile> <sep>
Example Usage:
(.venv) $ python3 CSV2Parquet.py sampleDB.csv output.parquet \t
**Calling from a python program**
.. code-block:: python
import PAMI.extras.convert.CSV2Parquet as cp
inputFile = 'sampleDB.csv'
sep = "\t"
outputFile = 'output.parquet'
obj = cp.CSV2Parquet(inputFile, outputFile, sep)
obj.convert()
obj.printStats()
**Credits**
The complete program was written by P. Likhitha and revised by Tarun Sreepada under the supervision of Professor Rage Uday Kiran.
"""
def __init__(self, inputFile, outputFile, sep):
self.inputFile = inputFile
self.outputFile = outputFile
self.sep = sep
self.start = None
self.end = None
self.pid = None
self.memoryUSS = float()
self.memoryRSS = float()
[docs]
def convert(self):
"""
This function converts the input CSV file to a data frame, which is then transformed into a Parquet file.
"""
self.start = time.time()
file = []
maxLen = 0
with open(self.inputFile, "r") as f:
for line in f:
file.append(line.strip().split(self.sep))
maxLen = max(maxLen, len(file[-1]))
for i in range(len(file)):
file[i] += [""] * (maxLen - len(file[i]))
df = pd.DataFrame(file)
df.to_parquet(self.outputFile)
self.end = time.time()
self.pid = os.getpid()
process = psutil.Process(self.pid)
self.memoryUSS = process.memory_full_info().uss
self.memoryRSS = process.memory_info().rss
[docs]
def getMemoryUSS(self):
"""
Returns the memory used by the process in USS (Unique Set Size).
:return: The amount of memory (in bytes) used exclusively by the process
:rtype: int
"""
return self.memoryUSS
[docs]
def getRuntime(self):
"""
Returns the time taken to complete the CSV to Parquet conversion.
:return: The runtime of the conversion process in seconds.
:rtype: float
"""
return self.end - self.start
[docs]
def printStats(self):
"""
Prints the resource usage statistics including memory consumption (USS and RSS) and the runtime.
:return: Prints memory usage and runtime to the console.
"""
print("Memory usage (USS):", self.memoryUSS)
print("Memory usage (RSS):", self.memoryRSS)
print("Runtime:", self.end - self.start)
if __name__ == '__main__':
#file = "Transactional_T10I4D100K.csv"
#sep = "\t"
#outputFile = "output.parquet"
if len(sys.argv) == 4:
obj = CSV2Parquet(sys.argv[1], sys.argv[2], sys.argv[3])
else:
raise ValueError("Invalid number of arguments. Args: <inputFile> <outputFile> <separator>")
#obj.convert()
#obj.printStats()