Merge pull request VikParuchuri#179 from VikParuchuri/dev

Surya OCR 2
isliu11 · Aug 16, 2024 · 8d5affa · 8d5affa
2 parents 8cd024d + e9aa875
commit 8d5affa
Show file tree

Hide file tree

Showing 21 changed files with 1,625 additions and 713 deletions.
diff --git a/README.md b/README.md
@@ -51,7 +51,7 @@ There is a hosted API for all surya models available [here](https://www.datalab.
 
 I want surya to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.
 
-The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).
+The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. You also must not be competitive with the [Datalab API](https://www.datalab.to/).  If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).
 
 # Installation
 
@@ -84,12 +84,12 @@ surya_gui
 This command will write out a json file with the detected text and bboxes:
 
 ```shell
-surya_ocr DATA_PATH --images --langs hi,en
+surya_ocr DATA_PATH
 ```
 
 - `DATA_PATH` can be an image, pdf, or folder of images/pdfs
-- `--langs` specifies the language(s) to use for OCR.  You can comma separate multiple languages (I don't recommend using more than `4`). Use the language name or two-letter ISO code from [here](https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes).  Surya supports the 90+ languages found in `surya/languages.py`.
-- `--lang_file` if you want to use a different language for different PDFs/images, you can specify languages here.  The format is a JSON dict with the keys being filenames and the values as a list, like `{"file1.pdf": ["en", "hi"], "file2.pdf": ["en"]}`.
+- `--langs` is an optional (but recommended) argument that specifies the language(s) to use for OCR.  You can comma separate multiple languages. Use the language name or two-letter ISO code from [here](https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes).  Surya supports the 90+ languages found in `surya/languages.py`.
+- `--lang_file` optionally specifies a file of languages, for when you want to use different languages for different PDFs/images.  The format is a JSON dict with the keys being filenames and the values as a list, like `{"file1.pdf": ["en", "hi"], "file2.pdf": ["en"]}`.
 - `--images` will save images of the pages and detected text lines (optional)
 - `--results_dir` specifies the directory to save results to instead of the default
 - `--max` specifies the maximum number of pages to process if you don't want to process everything
@@ -108,21 +108,21 @@ The `results.json` file will contain a json dictionary where the keys are the in
 
 **Performance tips**
 
-Setting the `RECOGNITION_BATCH_SIZE` env var properly will make a big difference when using a GPU.  Each batch item will use `50MB` of VRAM, so very high batch sizes are possible.  The default is a batch size `256`, which will use about 12.8GB of VRAM.  Depending on your CPU core count, it may help, too - the default CPU batch size is `32`.
+Setting the `RECOGNITION_BATCH_SIZE` env var properly will make a big difference when using a GPU.  Each batch item will use `40MB` of VRAM, so very high batch sizes are possible.  The default is a batch size of `512`, which will use about 20GB of VRAM.  Depending on your CPU core count, tuning the batch size may help on CPU, too - the default CPU batch size is `32`.
 
 ### From python
 
 ```python
 from PIL import Image
 from surya.ocr import run_ocr
-from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
-from surya.model.recognition.model import load_model as load_recognition_model
-from surya.model.recognition.processor import load_processor as load_recognition_processor
+from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
+from surya.model.recognition.model import load_model as load_rec_model
+from surya.model.recognition.processor import load_processor as load_rec_processor
 
 image = Image.open(IMAGE_PATH)
-langs = ["en"] # Replace with your languages
-det_processor, det_model = load_detection_processor(), load_detection_model()
-rec_model, rec_processor = load_recognition_model(), load_recognition_processor()
+langs = ["en"] # Replace with your languages - optional but recommended
+det_processor, det_model = load_det_processor(), load_det_model()
+rec_model, rec_processor = load_rec_model(), load_rec_processor()
 
 predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
 ```
@@ -134,15 +134,15 @@ The OCR model can be compiled to get an ~15% speedup in total inference time.  T
 ```python
 import torch
 
-rec_model.decoder.model.decoder = torch.compile(rec_model.decoder.model.decoder)
+rec_model.decoder.model = torch.compile(rec_model.decoder.model)
 ```
 
 ## Text line detection
 
 This command will write out a json file with the detected bboxes.
 
 ```shell
-surya_detect DATA_PATH --images
+surya_detect DATA_PATH
 ```
 
 - `DATA_PATH` can be an image, pdf, or folder of images/pdfs
@@ -184,7 +184,7 @@ predictions = batch_text_detection([image], model, processor)
 This command will write out a json file with the detected layout.
 
 ```shell
-surya_layout DATA_PATH --images
+surya_layout DATA_PATH
 ```
 
 - `DATA_PATH` can be an image, pdf, or folder of images/pdfs
@@ -231,7 +231,7 @@ layout_predictions = batch_layout_detection([image], model, processor, line_pred
 This command will write out a json file with the detected reading order and layout.
 
 ```shell
-surya_order DATA_PATH --images
+surya_order DATA_PATH
 ```
 
 - `DATA_PATH` can be an image, pdf, or folder of images/pdfs
@@ -417,7 +417,9 @@ python benchmark/recognition.py --tesseract
 - `--debug 2` will render images with detected text
 - `--results_dir` will let you specify a directory to save results to instead of the default one
 - `--tesseract` will run the benchmark with tesseract.  You have to run `sudo apt-get install tesseract-ocr-all` to install all tesseract data, and set `TESSDATA_PREFIX` to the path to the tesseract data folder.
+
 - Set `RECOGNITION_BATCH_SIZE=864` to use the same batch size as the benchmark.
+- Set `RECOGNITION_BENCH_DATASET_NAME=vikp/rec_bench_hist` to use the historical document data for benchmarking.  This data comes from the [tapuscorpus](https://github.com/HTR-United/tapuscorpus).
 
 **Layout analysis**
 

diff --git a/benchmark/recognition.py b/benchmark/recognition.py
@@ -30,6 +30,7 @@ def main():
     parser.add_argument("--langs", type=str, help="Specify certain languages to benchmark.", default=None)
     parser.add_argument("--tess_cpus", type=int, help="Number of CPUs to use for tesseract.", default=28)
     parser.add_argument("--compile", action="store_true", help="Compile the model.", default=False)
+    parser.add_argument("--specify_language", action="store_true", help="Pass language codes into the model.", default=False)
     args = parser.parse_args()
 
     if args.compile:
@@ -46,7 +47,7 @@ def main():
 
     if args.langs:
         langs = args.langs.split(",")
-        dataset = dataset.filter(lambda x: x["language"] in langs)
+        dataset = dataset.filter(lambda x: x["language"] in langs, num_proc=4)
 
     images = list(dataset["image"])
     images = convert_if_not_rgb(images)
@@ -62,14 +63,17 @@ def main():
             lang_list.append([l])
         else:
             lang_list.append(l)
+    n_list = [None] * len(images)
 
     if args.compile:
-        rec_model.decoder.model.decoder = torch.compile(rec_model.decoder.model.decoder)
+        torch.set_float32_matmul_precision('high')
+        torch._dynamo.config.cache_size_limit = 64
+        rec_model.decoder.model = torch.compile(rec_model.decoder.model)
         # Run through one batch to compile the model
         run_recognition(images[:1], lang_list[:1], rec_model, rec_processor, bboxes=bboxes[:1])
 
     start = time.time()
-    predictions_by_image = run_recognition(images, lang_list, rec_model, rec_processor, bboxes=bboxes)
+    predictions_by_image = run_recognition(images, lang_list if args.specify_language else n_list, rec_model, rec_processor, bboxes=bboxes)
     surya_time = time.time() - start
 
     surya_scores = defaultdict(list)
@@ -84,9 +88,9 @@ def main():
     flat_surya_scores = [s for l in surya_scores for s in surya_scores[l]]
     benchmark_stats = {
         "surya": {
-            "avg_score": sum(flat_surya_scores) / len(flat_surya_scores),
-            "lang_scores": {l: sum(scores) / len(scores) for l, scores in surya_scores.items()},
-            "time_per_img": surya_time / len(images)
+            "avg_score": sum(flat_surya_scores) / max(1, len(flat_surya_scores)),
+            "lang_scores": {l: sum(scores) / max(1, len(scores)) for l, scores in surya_scores.items()},
+            "time_per_img": surya_time / max(1, len(images))
         }
     }
 
@@ -134,7 +138,7 @@ def main():
         json.dump(benchmark_stats, f)
 
     key_languages = [k for k in KEY_LANGUAGES if k in surya_scores]
-    table_headers = ["Model", "Time per page (s)", "Avg Score"] + KEY_LANGUAGES
+    table_headers = ["Model", "Time per page (s)", "Avg Score"] + key_languages
     table_data = [
         ["surya", benchmark_stats["surya"]["time_per_img"], benchmark_stats["surya"]["avg_score"]] + [benchmark_stats["surya"]["lang_scores"][l] for l in key_languages],
     ]

diff --git a/ocr_app.py b/ocr_app.py
@@ -125,7 +125,7 @@ def page_count(pdf_file):
 """)
 
 in_file = st.sidebar.file_uploader("PDF file or image:", type=["pdf", "png", "jpg", "jpeg", "gif", "webp"])
-languages = st.sidebar.multiselect("Languages", sorted(list(CODE_TO_LANGUAGE.values())), default=["English"], max_selections=4)
+languages = st.sidebar.multiselect("Languages", sorted(list(CODE_TO_LANGUAGE.values())), default=[], max_selections=4, help="Select the languages in the image (if known) to improve OCR accuracy.  Optional.")
 
 if in_file is None:
     st.stop()

diff --git a/ocr_text.py b/ocr_text.py
@@ -1,6 +1,7 @@
 import os
 import argparse
 import json
+import time
 from collections import defaultdict
 
 import torch
@@ -23,12 +24,11 @@ def main():
     parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None)
     parser.add_argument("--start_page", type=int, help="Page to start processing at.", default=0)
     parser.add_argument("--images", action="store_true", help="Save images of detected bboxes.", default=False)
-    parser.add_argument("--langs", type=str, help="Language(s) to use for OCR. Comma separate for multiple. Can be a capitalized language name, or a 2-letter ISO 639 code.", default=None)
-    parser.add_argument("--lang_file", type=str, help="Path to file with languages to use for OCR. Should be a JSON dict with file names as keys, and the value being a list of language codes/names.", default=None)
+    parser.add_argument("--langs", type=str, help="Optional language(s) to use for OCR. Comma separate for multiple. Can be a capitalized language name, or a 2-letter ISO 639 code.", default=None)
+    parser.add_argument("--lang_file", type=str, help="Optional path to file with languages to use for OCR. Should be a JSON dict with file names as keys, and the value being a list of language codes/names.", default=None)
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging.", default=False)
     args = parser.parse_args()
 
-    assert args.langs or args.lang_file, "Must provide either --langs or --lang_file"
-
     if os.path.isdir(args.input_path):
         images, names = load_from_folder(args.input_path, args.max, args.start_page)
         folder_name = os.path.basename(args.input_path)
@@ -42,23 +42,29 @@ def main():
         for lang in langs:
             replace_lang_with_code(lang)
         image_langs = langs
-    else:
+    elif args.langs:
         # We got our language settings from the input
         langs = args.langs.split(",")
         replace_lang_with_code(langs)
         image_langs = [langs] * len(images)
+    else:
+        image_langs = [None] * len(images)
 
     det_processor = load_detection_processor()
     det_model = load_detection_model()
 
-    _, lang_tokens = _tokenize("", get_unique_langs(image_langs))
-    rec_model = load_recognition_model(langs=lang_tokens) # Prune model moe layer to only include languages we need
+    rec_model = load_recognition_model()
     rec_processor = load_recognition_processor()
 
     result_path = os.path.join(args.results_dir, folder_name)
     os.makedirs(result_path, exist_ok=True)
 
+    start = time.time()
     predictions_by_image = run_ocr(images, image_langs, det_model, det_processor, rec_model, rec_processor)
+    if args.debug:
+        print(f"OCR took {time.time() - start:.2f} seconds")
+        max_chars = max([len(l.text) for p in predictions_by_image for l in p.text_lines])
+        print(f"Max chars: {max_chars}")
 
     if args.images:
         for idx, (name, image, pred, langs) in enumerate(zip(names, images, predictions_by_image, image_langs)):

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.4.15"
+version = "0.5.0"
 description = "OCR, layout, reading order, and line detection in 90+ languages"
 authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"

diff --git a/surya/detection.py b/surya/detection.py
@@ -18,7 +18,7 @@
 def get_batch_size():
     batch_size = settings.DETECTOR_BATCH_SIZE
     if batch_size is None:
-        batch_size = 6
+        batch_size = 8
         if settings.TORCH_DEVICE_MODEL == "mps":
             batch_size = 8
         if settings.TORCH_DEVICE_MODEL == "cuda":

diff --git a/surya/input/processing.py b/surya/input/processing.py
@@ -84,6 +84,8 @@ def slice_bboxes_from_image(image: Image.Image, bboxes):
     lines = []
     for bbox in bboxes:
         line = image.crop((bbox[0], bbox[1], bbox[2], bbox[3]))
+        if line.size[0] == 0:
+            print(f"Warning: found an empty line with bbox {bbox}")
         lines.append(line)
     return lines
 

diff --git a/surya/languages.py b/surya/languages.py
@@ -1,4 +1,5 @@
 CODE_TO_LANGUAGE = {
+    "_math": "Math",
     'af': 'Afrikaans',
     'am': 'Amharic',
     'ar': 'Arabic',