Merge pull request #18 from Menghuan1918/dev

Dev
NoEdgeAI · Jul 5, 2024 · faa63e8 · faa63e8
2 parents 6728f47 + c9a998f
commit faa63e8
Show file tree

Hide file tree

Showing 13 changed files with 302 additions and 145 deletions.
diff --git a/README.md b/README.md
@@ -28,7 +28,11 @@ pdfdeal
 
 Check out the [task list](https://github.com/users/Menghuan1918/projects/3) to see what new features are in the works!
 
-### V0.1.4
+### V0.1.4/V0.1.5
+
+#### 🐛 Bug Fixes
+
+- Fix error when customizing output folders in some cases: `os.rename error - system cannot move files to other disks`
 
 #### 🚀 Other
 

diff --git a/README_CN.md b/README_CN.md
@@ -28,7 +28,11 @@ pdfdeal
 
 你可以[在此处](https://github.com/users/Menghuan1918/projects/3)查看正在开发的新功能！如果直接使用Doc2X进行转换请参阅[Doc2x支持](./docs/doc2x_cn.md)。
 
-### V0.1.4
+### V0.1.4/V0.1.5
+
+#### 🐛 Bug 修复
+
+- 修复某些情况下自定义输出文件夹时会出现错误：`os.rename错误-系统无法将文件移动到其他磁盘`
 
 #### 🚀 其他
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pdfdeal"
-version = "0.1.4"
+version = "0.1.5"
 authors = [{ name = "Menghuan1918", email = "[email protected]" }]
 description = "Easier to deal with PDF, extract readable text and OCR to recognise image text and clean the format. Make it more suitable for knowledge base construction. Best performance with Doc2X."
 readme = "README.md"

diff --git a/src/pdfdeal/FileTools/OCR/doc2x.py b/src/pdfdeal/FileTools/OCR/doc2x.py
@@ -0,0 +1,85 @@
+from PIL import Image
+import os
+from typing import Tuple
+
+def doc2x_judgements(image_file):
+    """
+    Decide whether an image is small enough to enable the purely formulaic model.
+
+    Args:
+        image_file: Anything ``PIL.Image.open`` accepts (callers in this file
+            pass a file path).
+
+    Returns:
+        1 when both entries of the image's ``size`` are below 50, otherwise 0.
+    """
+    with Image.open(image_file) as picture:
+        first_dim, second_dim = picture.size
+    # The image is closed here; only the two size values are needed.
+    return 1 if (first_dim < 50 and second_dim < 50) else 0
+
+def Doc2X_OCR(Client):
+    """
+    Build an OCR function that sends images to Doc2X through ``Client``.
+
+    Args:
+        Client: Doc2X client object. ``Client.get_limit()`` is called once up
+            front and ``Client.pic2file(...)`` is called once per image.
+
+    Returns:
+        The nested ``OCR(path, language, GPU)`` function described below.
+
+    Raises:
+        Exception: If ``get_limit()`` raises, or if it reports a limit of 0.
+    """
+    try:
+        limit = Client.get_limit()
+    except Exception as e:
+        raise Exception(f"Get error! {e}") from e
+    if limit == 0:
+        raise Exception("The Doc2X limit is 0, please check your account.")
+
+    image_suffixes = (".jpg", ".png", ".jpeg")
+
+    def _ocr_image(image_path) -> Tuple[str, bool]:
+        """Recognise one image; return its text and whether it fully succeeded."""
+        pieces = []
+        succeeded = True
+        try:
+            equation = doc2x_judgements(image_file=image_path)
+            texts, Failed, Fail_flag = Client.pic2file(
+                image_file=image_path,
+                output_format="txts",
+                equation=equation,
+                version="v2",
+            )
+            for t in texts:
+                pieces.append(t + "\n")
+            if Fail_flag:
+                for fail in Failed:
+                    if fail["error"] != "":
+                        print(
+                            f"Get error when using Doc2X to do ocr. {fail['error']}"
+                        )
+                succeeded = False
+        except Exception as e:
+            # Best effort: report the problem and let the caller move on to
+            # the next file. Text gathered before the failure is kept.
+            print(f"Get error when using Doc2X to do ocr and pass to next file. {e}")
+            succeeded = False
+        return "".join(pieces), succeeded
+
+    def OCR(path, language=["ch_sim", "en"], GPU=False) -> Tuple[str, bool]:
+        """
+        OCR a single image file, or every image found under a directory.
+
+        Args:
+            path: Path to a ``.jpg``/``.png``/``.jpeg`` file, or to a directory
+                that is walked recursively for such files.
+            language: Not used by this backend.
+            GPU: Not used by this backend.
+
+        Returns:
+            ``(text, All_Done)``: the recognised text, one line per returned
+            item, and ``False`` if any image failed or reported a failure.
+        """
+        collected = []
+        All_Done = True
+        if os.path.isfile(path) and path.endswith(image_suffixes):
+            piece, All_Done = _ocr_image(path)
+            collected.append(piece)
+        elif os.path.isdir(path):
+            for root, _dirs, files in os.walk(path):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    if not file_path.endswith(image_suffixes):
+                        continue
+                    # Since the size and dimensions of each image may differ,
+                    # batch processing mode is not used: every image is sent
+                    # on its own, using its own path (not the directory).
+                    piece, succeeded = _ocr_image(file_path)
+                    collected.append(piece)
+                    All_Done = All_Done and succeeded
+        return "".join(collected), All_Done
+
+    return OCR
diff --git a/src/pdfdeal/FileTools/OCR/easyocr.py b/src/pdfdeal/FileTools/OCR/easyocr.py
@@ -0,0 +1,38 @@
+import os
+from typing import Tuple
+
+
+def OCR_easyocr(path, language=["ch_sim", "en"], GPU=False) -> Tuple[str, bool]:
+    """
+    OCR with easyocr.
+
+    Args:
+        path: Path to a ``.jpg``/``.png``/``.jpeg`` file, or to a directory
+            that is walked recursively for such files.
+        language: Language list handed to ``easyocr.Reader``.
+        GPU: Passed to ``easyocr.Reader`` as ``gpu``.
+
+    Returns:
+        ``(texts, All_Done)``: the recognised text and ``False`` if any image
+        raised during recognition. Text from different images is separated by
+        a newline so that neighbouring images do not run together.
+
+    Raises:
+        ImportError: If easyocr is not installed.
+    """
+    try:
+        import easyocr
+    except ImportError:
+        raise ImportError("Please install easyocr first, use 'pip install easyocr'")
+    reader = easyocr.Reader(language, gpu=GPU)
+    image_suffixes = (".jpg", ".png", ".jpeg")
+
+    image_paths = []
+    if os.path.isfile(path) and path.endswith(image_suffixes):
+        image_paths.append(path)
+    elif os.path.isdir(path):
+        for root, _dirs, files in os.walk(path):
+            for file in files:
+                file_path = os.path.join(root, file)
+                if file_path.endswith(image_suffixes):
+                    image_paths.append(file_path)
+
+    All_Done = True
+    per_image = []
+    for image_path in image_paths:
+        try:
+            result = reader.readtext(image_path, detail=0, paragraph=True)
+        except Exception as e:
+            # Best effort: report the failure and carry on with the next file.
+            All_Done = False
+            print(f"Get error when using easyocr to do ocr and pass to next file. {e}")
+            continue
+        recognised = "\n".join(result)
+        if recognised:
+            per_image.append(recognised)
+    # Joining per-image texts with a newline keeps single-file output unchanged
+    # while stopping the end of one image fusing with the start of the next.
+    return "\n".join(per_image), All_Done
diff --git a/src/pdfdeal/FileTools/OCR/pytesseract.py b/src/pdfdeal/FileTools/OCR/pytesseract.py
@@ -0,0 +1,40 @@
+import os
+from typing import Tuple
+
+def OCR_pytesseract(path, language=["eng"], GPU=False) -> Tuple[str, bool]:
+    """
+    OCR with pytesseract.
+
+    Args:
+        path: Path to a ``.jpg``/``.png``/``.jpeg`` file, or to a directory
+            that is walked recursively for such files.
+        language: Language list; only the first entry is passed to
+            ``pytesseract.image_to_string`` as ``lang``.
+        GPU: Not used by this backend.
+
+    Returns:
+        ``(text, All_Done)``: the recognised text with a newline after each
+        image, and ``False`` if any image raised during recognition.
+
+    Raises:
+        ImportError: If pytesseract is not installed.
+    """
+    try:
+        import pytesseract
+    except ImportError:
+        raise ImportError(
+            "Please install pytesseract first, use 'pip install pytesseract'"
+        )
+
+    suffixes = (".jpg", ".png", ".jpeg")
+
+    def recognise(image_path):
+        """Return the text of one image plus a newline, or None on failure."""
+        try:
+            return pytesseract.image_to_string(image_path, lang=language[0]) + "\n"
+        except Exception as e:
+            print(
+                f"Get error when using pytesseract to do ocr and pass to next file. {e}"
+            )
+            return None
+
+    outcomes = []
+    if os.path.isfile(path) and path.endswith(suffixes):
+        outcomes.append(recognise(path))
+    elif os.path.isdir(path):
+        for folder, _subdirs, names in os.walk(path):
+            for name in names:
+                candidate = os.path.join(folder, name)
+                if candidate.endswith(suffixes):
+                    outcomes.append(recognise(candidate))
+
+    # A None entry marks an image that failed; it adds no text.
+    All_Done = all(piece is not None for piece in outcomes)
+    text = "".join(piece for piece in outcomes if piece is not None)
+    return text, All_Done
diff --git a/src/pdfdeal/FileTools/Tool/doc2x.py b/src/pdfdeal/FileTools/Tool/doc2x.py
@@ -0,0 +1,18 @@
+from typing import Tuple, Callable
+
+
+def Doc2X_Tool(Client) -> Callable:
+    """
+    Build a PDF-processing function backed by a Doc2X client.
+
+    Args:
+        Client: Doc2X client object; ``get_limit()`` is checked once here and
+            ``pdfdeal(...)`` is called on every use of the returned function.
+
+    Returns:
+        ``Tool(path, options)``, which forwards ``path`` and
+        ``options["output"]`` to ``Client.pdfdeal`` and returns its result.
+
+    Raises:
+        Exception: If ``get_limit()`` raises, or if it reports a limit of 0.
+    """
+    try:
+        remaining = Client.get_limit()
+    except Exception as e:
+        raise Exception(f"Get error! {e}")
+    else:
+        if remaining == 0:
+            raise Exception("The Doc2X limit is 0, please check your account.")
+
+    def Tool(path: str, options: dict) -> Tuple[list, list, bool]:
+        output_dir = options["output"]
+        return Client.pdfdeal(input=path, path=output_dir, version="v2")
+
+    return Tool
diff --git a/src/pdfdeal/FileTools/file_tools.py b/src/pdfdeal/FileTools/file_tools.py
@@ -7,6 +7,7 @@
 import os
 import zipfile
 from .ocr import OCR_easyocr
+import shutil
 
 
 def clean_text(text):
@@ -179,6 +180,7 @@ def list_rename(files: list, new_name: list) -> list:
             new_files.append("")
             continue
         new_file = os.path.join(os.path.dirname(file), name)
-        os.rename(file, new_file)
+        os.makedirs(new_file, exist_ok=True)
+        shutil.move(file, new_file)
         new_files.append(new_file)
-    return new_files
+    return new_files