-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_encoding.py
68 lines (56 loc) · 2.18 KB
/
check_encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
This script checks the encoding of a file.
"""
import os
import argparse
import chardet
from pipeline import logger
from pipeline import FileUtils
def check_encoding(file_path) -> dict:
    """
    Checks the encoding of a file.

    Parameters:
        file_path (str): The path to the file to check.

    Returns:
        dict: The dictionary returned by ``chardet.detect`` for the file's
        raw bytes, or ``{"error": <message>}`` when the path is not a
        regular file, the file exceeds the 10 MB limit, permission is
        denied, or another OS error occurs while reading it.
    """
    max_size = 10 * 1024 * 1024  # 10 MB limit

    if not os.path.isfile(file_path):
        logger.error("Error: The file %s does not exist.", file_path)
        # Return immediately; otherwise the code below would try to read the
        # missing path (or a directory) and log a second, different error.
        return {"error": "File not found"}

    try:
        # Check file size to avoid processing excessively large files
        if os.path.getsize(file_path) > max_size:
            logger.error("Error: The file %s is too large to process.", file_path)
            # Return immediately so the oversized file is never read and the
            # error is not overwritten by a detection result.
            return {"error": "File too large"}

        # The size was bounded above, so a single read is sufficient.
        with open(file_path, 'rb') as f:
            raw_data = f.read()
    except FileNotFoundError:
        # The file can disappear between the isfile() check and the read.
        logger.error("Error: The file %s was not found.", file_path)
        return {"error": "File not found"}
    except PermissionError:
        logger.error("Error: Permission denied for file %s.", file_path)
        return {"error": "Permission denied"}
    except OSError as e:
        logger.error("OS error occurred: %s", e)
        return {"error": str(e)}

    # Detection is kept outside the try block so only I/O failures are
    # translated into error dictionaries.
    return chardet.detect(raw_data)
def main(files_to_check):
    """
    Check the encoding of every file in ``files_to_check``.

    For each path the detection result from ``check_encoding`` is logged.
    The file is then scanned with ``FileUtils.find_non_ascii_bytes``; when
    that returns a non-empty value, the positions are logged and handed to
    ``FileUtils.clean_non_ascii_positions``.

    Parameters:
        files_to_check: Iterable of file paths to process.
    """
    for path in files_to_check:
        detection = check_encoding(path)
        logger.info("%s: %s", path, detection)

        non_ascii = FileUtils.find_non_ascii_bytes(path)
        if not non_ascii:
            # Nothing to report or clean for this file.
            continue

        logger.info("Non-ASCII bytes found in file: %s", path)
        logger.info(non_ascii)
        # NOTE(review): presumably this modifies the file on disk — confirm
        # against FileUtils.clean_non_ascii_positions.
        FileUtils.clean_non_ascii_positions(path, non_ascii)
if __name__ == "__main__":
    # Command-line entry point: accept one or more file paths and check each.
    cli = argparse.ArgumentParser(description="Check the encoding of files.")
    cli.add_argument(
        "files",
        metavar="F",
        type=str,
        nargs="+",
        help="Files to check",
    )
    main(cli.parse_args().files)