Skip to content

Commit

Permalink
Merge pull request #18 from Menghuan1918/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
Menghuan1918 authored Jul 5, 2024
2 parents 6728f47 + c9a998f commit faa63e8
Show file tree
Hide file tree
Showing 13 changed files with 302 additions and 145 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ pdfdeal

Check out the [task list](https://github.com/users/Menghuan1918/projects/3) to see what new features are in the works!

### V0.1.4
### V0.1.4/V0.1.5

#### 🐛 Bug Fixes

- Fix error when customizing output folders in some cases: `os.rename error - system cannot move files to other disks`

#### 🚀 Other

Expand Down
6 changes: 5 additions & 1 deletion README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ pdfdeal

你可以[在此处](https://github.com/users/Menghuan1918/projects/3)查看正在开发的新功能!如果直接使用Doc2X进行转换请参阅[Doc2x支持](./docs/doc2x_cn.md)

### V0.1.4
### V0.1.4/V0.1.5

#### 🐛 Bug 修复

- 修复某些情况下自定义输出文件夹时会出现错误:`os.rename错误-系统无法将文件移动到其他磁盘`

#### 🚀 其他

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "pdfdeal"
version = "0.1.4"
version = "0.1.5"
authors = [{ name = "Menghuan1918", email = "[email protected]" }]
description = "Easier to deal with PDF, extract readable text and OCR to recognise image text and clean the format. Make it more suitable for knowledge base construction. Best performance with Doc2X."
readme = "README.md"
Expand Down
85 changes: 85 additions & 0 deletions src/pdfdeal/FileTools/OCR/doc2x.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from PIL import Image
import os
from typing import Tuple

def doc2x_judgements(image_file):
    """
    Decide whether an image is small enough to enable the purely formulaic model.

    Args:
        image_file: Anything `PIL.Image.open` accepts (typically a file path).

    Returns:
        1 when both the width and the height are below 50, otherwise 0.
    """
    with Image.open(image_file) as picture:
        width, height = picture.size
    return 1 if width < 50 and height < 50 else 0

def Doc2X_OCR(Client):
    """
    Build an OCR callable backed by a Doc2X client.

    The client's remaining limit is checked once, up front, so that an
    unusable account fails here instead of on the first image.

    Args:
        Client: Object exposing `get_limit()` and `pic2file(...)`; those are
            the only two methods this function uses.

    Returns:
        A function `OCR(path, language, GPU)` that returns `(text, All_Done)`.

    Raises:
        Exception: If `Client.get_limit()` raises, or if it reports a limit of 0.
    """
    try:
        limit = Client.get_limit()
    except Exception as e:
        raise Exception(f"Get error! {e}") from e
    if limit == 0:
        raise Exception("The Doc2X limit is 0, please check your account.")

    image_suffixes = (".jpg", ".png", ".jpeg")

    def _ocr_image(image_path) -> Tuple[str, bool]:
        """OCR a single image; return its text and whether it fully succeeded."""
        recognized = ""
        succeeded = True
        try:
            equation = doc2x_judgements(image_file=image_path)
            texts, Failed, Fail_flag = Client.pic2file(
                image_file=image_path,
                output_format="txts",
                equation=equation,
                version="v2",
            )
            for t in texts:
                recognized += t + "\n"
            if Fail_flag:
                for fail in Failed:
                    if fail["error"] != "":
                        print(
                            f"Get error when using Doc2X to do ocr. {fail['error']}"
                        )
                        succeeded = False
        except Exception as e:
            # Best effort: report the problem and let the caller move on to
            # the next file instead of aborting the whole run.
            print(f"Get error when using Doc2X to do ocr and pass to next file. {e}")
            succeeded = False
        return recognized, succeeded

    def OCR(path, language=["ch_sim", "en"], GPU=False) -> Tuple[str, bool]:
        """
        OCR one image file, or every image found under a folder.

        Args:
            path: An image file (.jpg/.png/.jpeg) or a directory to walk.
            language: Not used by this backend.
            GPU: Not used by this backend.

        Returns:
            `(text, All_Done)`: the recognised text with one trailing newline
            per returned text item, and False if any image reported an error.
            Any other kind of path yields `("", True)`.
        """
        text = ""
        All_Done = True
        if os.path.isfile(path) and path.endswith(image_suffixes):
            text, All_Done = _ocr_image(path)
        elif os.path.isdir(path):
            for root, _dirs, files in os.walk(path):
                for file in files:
                    file_path = os.path.join(root, file)
                    if file_path.endswith(image_suffixes):
                        # * Since the size and dimensions of each image may be different, batch processing mode is not used
                        # Send the individual file, not the directory being walked.
                        piece, ok = _ocr_image(file_path)
                        text += piece
                        All_Done = All_Done and ok
        return text, All_Done

    return OCR
38 changes: 38 additions & 0 deletions src/pdfdeal/FileTools/OCR/easyocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os
from typing import Tuple


def OCR_easyocr(path, language=["ch_sim", "en"], GPU=False) -> Tuple[str, bool]:
    """
    Run easyocr on one image file, or on every image found under a folder.

    Args:
        path: An image file (.jpg/.png/.jpeg) or a directory to walk.
        language: Language codes handed to `easyocr.Reader`.
        GPU: Forwarded to `easyocr.Reader` as its `gpu` argument.

    Returns:
        `(texts, All_Done)`: the recognised paragraphs of each image joined by
        newlines (images are concatenated without a separator), and False if
        any image raised during recognition.

    Raises:
        ImportError: If easyocr is not installed.
    """
    try:
        import easyocr
    except ImportError:
        raise ImportError("Please install easyocr first, use 'pip install easyocr'")

    image_suffixes = (".jpg", ".png", ".jpeg")

    def iter_images():
        # Lazily yield the image paths selected by `path`.
        if os.path.isfile(path) and path.endswith(image_suffixes):
            yield path
        elif os.path.isdir(path):
            for folder, _subdirs, names in os.walk(path):
                for name in names:
                    candidate = os.path.join(folder, name)
                    if candidate.endswith(image_suffixes):
                        yield candidate

    reader = easyocr.Reader(language, gpu=GPU)
    succeeded = True
    pieces = []
    for image in iter_images():
        try:
            paragraphs = reader.readtext(image, detail=0, paragraph=True)
        except Exception as e:
            # A failed image contributes an empty string and flips the flag.
            paragraphs = [""]
            succeeded = False
            print(f"Get error when using easyocr to do ocr and pass to next file. {e}")
        pieces.append("\n".join(paragraphs))
    return "".join(pieces), succeeded
40 changes: 40 additions & 0 deletions src/pdfdeal/FileTools/OCR/pytesseract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
from typing import Tuple

def OCR_pytesseract(path, language=["eng"], GPU=False) -> Tuple[str, bool]:
    """
    Run pytesseract on one image file, or on every image found under a folder.

    Args:
        path: An image file (.jpg/.png/.jpeg) or a directory to walk.
        language: Language codes; only the first entry is passed to pytesseract.
        GPU: Not used by this backend.

    Returns:
        `(text, All_Done)`: the recognised text with a newline appended after
        each image, and False if any image raised during recognition.

    Raises:
        ImportError: If pytesseract is not installed.
    """
    try:
        import pytesseract
    except ImportError:
        raise ImportError(
            "Please install pytesseract first, use 'pip install pytesseract'"
        )

    image_suffixes = (".jpg", ".png", ".jpeg")

    def iter_images():
        # Lazily yield the image paths selected by `path`.
        if os.path.isfile(path) and path.endswith(image_suffixes):
            yield path
        elif os.path.isdir(path):
            for folder, _subdirs, names in os.walk(path):
                for name in names:
                    candidate = os.path.join(folder, name)
                    if candidate.endswith(image_suffixes):
                        yield candidate

    pieces = []
    succeeded = True
    for image in iter_images():
        try:
            pieces.append(
                pytesseract.image_to_string(image, lang=language[0]) + "\n"
            )
        except Exception as e:
            # A failed image adds nothing to the text and flips the flag.
            print(
                f"Get error when using pytesseract to do ocr and pass to next file. {e}"
            )
            succeeded = False
    return "".join(pieces), succeeded
18 changes: 18 additions & 0 deletions src/pdfdeal/FileTools/Tool/doc2x.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from typing import Tuple, Callable


def Doc2X_Tool(Client) -> Callable:
    """
    Build a PDF-processing callable backed by a Doc2X client.

    Args:
        Client: Object exposing `get_limit()` and `pdfdeal(...)`; those are
            the only two methods this function uses.

    Returns:
        A function `Tool(path, options)` that forwards to `Client.pdfdeal`.

    Raises:
        Exception: If `Client.get_limit()` raises, or if it reports a limit of 0.
    """
    try:
        remaining = Client.get_limit()
    except Exception as e:
        raise Exception(f"Get error! {e}")
    else:
        if remaining == 0:
            raise Exception("The Doc2X limit is 0, please check your account.")

    def Tool(path: str, options: dict) -> Tuple[list, list, bool]:
        # The output location is read from the "output" key of `options`.
        output_dir = options["output"]
        return Client.pdfdeal(input=path, path=output_dir, version="v2")

    return Tool
6 changes: 4 additions & 2 deletions src/pdfdeal/FileTools/file_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
import zipfile
from .ocr import OCR_easyocr
import shutil


def clean_text(text):
Expand Down Expand Up @@ -179,6 +180,7 @@ def list_rename(files: list, new_name: list) -> list:
new_files.append("")
continue
new_file = os.path.join(os.path.dirname(file), name)
os.rename(file, new_file)
os.makedirs(new_file, exist_ok=True)
shutil.move(file, new_file)
new_files.append(new_file)
return new_files
return new_files
Loading

0 comments on commit faa63e8

Please sign in to comment.