diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a37273b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +files/ diff --git a/README.md b/README.md index 5547206..cf47dcb 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,36 @@ -# Compress ECG byte +# Compress Files ## Instructions -After downloading the source code, ensure you have Python and Python Flask installed. -Then, run `python main.py` and navigate to `localhost:8888`. +1. Ensure Python and [Flask](https://flask.palletsprojects.com/) are installed +2. Run `python main.py` +3. Navigate to http://localhost:8888 ## Design Considerations and Key Decisions -After doing some initial research, I learned there's no standardized format for ECG files. -Therefore, there's nothing special we can do in terms of optimizing our compression algorithm for specific headers, special data structures, etc. +The project consisted of both a frontend and backend component. The frontend only required a straightforward web interface for file upload and display of compressed file details, thus a static HTML file sufficed. -The project would require both a frontend and backend component. The frontend only required a simple web interface to upload the file and display information about the resulting compressed file. +For the backend, a REST API was essential to facilitate file uploads and subsequent downloads of compressed files: +- `/api/upload_file` + - Accepts HTTP POST requests containing the file to compress + - Returns a JSON object including fields: `{"compressed_size":INT,"file_id":STR,"original_size":INT}` + - Essential information for user display and compressed file retrieval +- `/api/download_file/FILE_ID` + - Accepts HTTP GET requests + - Retrieves the compressed file corresponding to `FILE_ID` -For the backend, I went with Python Flask because it is the Python framework I have had the most experience with. 
-When a user uploads a file, the backend will save the uploaded file to a specific input folder and immediately compress it into a separate output folder. -Each file would be associated with a unique id (UUIDv4 for it's randomness) to prevent filename clashes. +Python Flask was selected for the backend due to its suitability for developing REST APIs. -This application could benefit from multithreading if there was enough load and big enough files as each file can be independently processed, but that is out of scope for this project. +Upon file upload, the backend saves and compresses the file, subsequently deleting the original. Each file is uniquely identified using a UUIDv4 to prevent naming conflicts. + +### Other considerations + +For scalability, multithreading could enhance performance with increased load and larger files, but this was beyond the project's current scope. + +Decompression functionality was not required for this project. However, if needed, decompression details could be stored server-side (e.g., in JSON files linked to each file ID) or within the compressed file itself. ## Compression Algorithm -I chose to go with a simple Huffman Coding algorithm because it is a true and tested algorithm. The only difficult part was deciding whether to make each node 3-bytes or 1-byte, but it was much better to go with 1-byte because it only has 256 possible combinations. +Research revealed no standardized format for ECG files, limiting optimization options for headers or data structures. + +The Huffman Coding algorithm was chosen for its proven reliability. Opting for a 1-byte node representation was determined to be optimal despite initially considering 3-byte nodes, since a single byte has only 256 possible values. 
\ No newline at end of file diff --git a/main.py b/main.py index 7651899..04753cf 100644 --- a/main.py +++ b/main.py @@ -9,23 +9,19 @@ import shutil from flask import Flask, current_app, request, send_file app = Flask(__name__) -INPUT_DIR = "./input/" -OUTPUT_DIR = "./output/" +FILES_DIR = "./files/" def init(): - if os.path.exists(INPUT_DIR): - shutil.rmtree(INPUT_DIR) - if os.path.exists(OUTPUT_DIR): - shutil.rmtree(OUTPUT_DIR) - os.mkdir(INPUT_DIR) - os.mkdir(OUTPUT_DIR) + if os.path.exists(FILES_DIR): + shutil.rmtree(FILES_DIR) + os.mkdir(FILES_DIR) def main(): init() app.run(host="0.0.0.0", debug=False, port=8888) """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -FLASK REST API +REST API """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" @app.route("/", methods=["GET"]) @@ -40,6 +36,9 @@ def upload_file(): file_id = save_file(request.files['file']) compression_info = compress(file_id) + input_path = get_input_path(file_id) + os.remove(input_path) + response = { "file_id": file_id, "original_size": compression_info.input_size, @@ -78,19 +77,11 @@ def compress(file_id: str) -> CompressionInfo: return CompressionInfo(input_size, output_size) -def cleanup(file_id: str): - """Clean up any files created relating to FILE_ID""" - input_path = get_input_path(file_id) - output_path = get_output_path(file_id) - - os.remove(input_path) - os.remove(output_path) - def get_input_path(file_id) -> str: - return f"{INPUT_DIR}/{file_id}" + return f"{FILES_DIR}/{file_id}-input" def get_output_path(file_id) -> str: - return f"{OUTPUT_DIR}/{file_id}" + return f"{FILES_DIR}/{file_id}-compressed" """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" HUFFMAN CODING @@ -112,7 +103,7 @@ class TreeNode: def get_codes(input_file: str) -> dict[str, str]: """ Given INPUT_FILE, read its contents and return a dictionary containing a - code for each byte. Uses Huffman coding + unique code for each unique byte. 
Uses Huffman coding """ # Get frequency of every byte in the file char_count = {} @@ -153,26 +144,28 @@ def get_codes(input_file: str) -> dict[str, str]: queue.append((node.right, code + '1')) return codes -def to_bytes(data): +def to_bytes(data: str): """ - Helper function to convert DATA to valid bytes + Helper function to convert DATA (string of 0's and 1's) to valid bytes """ b = bytearray() for i in range(0, len(data), 8): - b.append(int(data[i:i+8], 2)) + bits = data[i:i+8] + "0000000" # So that we have at least 8 bits + b.append(int(bits[0:8], 2)) + return bytes(b) def write_compressed_file(input_file: str, output_file: str, codes: dict[str, str]): """ Compress contents of INPUT_FILE to OUTPUT_FILE using CODES """ + data_str = "" with open(input_file, "rb") as input_file: - data_str = "" byte = input_file.read(1) while byte != b"": data_str += codes[byte] byte = input_file.read(1) - data_bytes = to_bytes(data_str) + data_bytes = to_bytes(data_str) with open(output_file, "wb") as output_file: output_file.write(data_bytes)