text extraction from doc

pierlag · Dec 22, 2022 · 3980cb7 · 3980cb7
1 parent 49efa86
commit 3980cb7
Show file tree

Hide file tree

Showing 4 changed files with 158 additions and 0 deletions.
diff --git a/code/pages/Doc-Extractor.py b/code/pages/Doc-Extractor.py
@@ -0,0 +1,80 @@
+import streamlit as st
+import os, json, re
+from urllib.error import URLError
+import requests
+from utilities.formrecognizer import analyze_read
+from os import path
+from utilities.azureblobstorage import upload_file, get_all_files
+
+def convert_file(fullpath, filename):
+    # Extract the text from the file
+    text = analyze_read(fullpath)
+    # Upload the text to Azure Blob Storage
+    upload_file(text, f'converted/{filename}.txt', 'application/text')
+
+########## START - MAIN ##########
+try:
+    # Set page layout to wide screen and menu item
+    menu_items = {
+	'Get help': None,
+	'Report a bug': None,
+	'About': '''
+	 ## Embeddings App
+
+	Document Reader Sample Demo.
+	'''
+    }
+    st.set_page_config(layout="wide", menu_items=menu_items)
+
+    hide_streamlit_style = """
+                <style>
+                #MainMenu {visibility: hidden;}
+                footer {visibility: hidden;}
+                </style>
+                """
+    st.markdown(hide_streamlit_style, unsafe_allow_html=True) 
+
+
+    uploaded_file = st.file_uploader("Upload a document", type=['pdf'])
+    if uploaded_file is not None:
+        # To read file as bytes:
+        bytes_data = uploaded_file.getvalue()
+
+        if st.session_state.get('filename', '') != uploaded_file.name:
+            # Upload a new file
+            st.session_state['filename'] = uploaded_file.name
+            st.session_state['file_url'] = upload_file(bytes_data, st.session_state['filename'])
+            # Get OCR with Read API
+            st.session_state['text'] = analyze_read(st.session_state['file_url'])
+
+        pdf_display = f'<iframe src="{st.session_state["file_url"]}" width="700" height="1000" type="application/pdf"></iframe>'
+
+    col1, col2, col3 = st.columns([2,1,1])
+
+    files_data = get_all_files()
+
+    cols = st.columns([2,2,1])
+    cols[0].write('Original File')
+    cols[1].write('Converted File')
+    cols[2].write('Convert')
+
+    for x in files_data:
+        col1, col2, col3,  = st.columns([2,2,1])
+        col1.write(f'<a href="{x["fullpath"]}">{x["filename"]}</a>', unsafe_allow_html=True)
+        if x['converted_path'] != '':
+            col2.write(f'<a href="{x["converted_path"]}">{x["filename"]}.txt</a>', unsafe_allow_html=True)
+        if not x['converted']:
+            col3.button('Convert', key=x['filename']+'_button', on_click=convert_file, args= (x['fullpath'],x['filename'],))
+
+
+except URLError as e:
+    st.error(
+        """
+        **This demo requires internet access.**
+
+        Connection error: %s
+        """
+        % e.reason
+    )
+
+########## END - MAIN ##########
diff --git a/code/requirements.txt b/code/requirements.txt
@@ -1,13 +1,19 @@
 altair==4.2.0
 attrs==22.1.0
+azure-ai-formrecognizer==3.2.0
+azure-common==1.1.28
+azure-core==1.26.1
+azure-storage-blob==12.14.1
 blinker==1.5
 cachetools==5.2.0
 certifi==2022.9.24
+cffi==1.15.1
 charset-normalizer==2.1.1
 click==8.1.3
 colorama==0.4.6
 commonmark==0.9.1
 contourpy==1.0.6
+cryptography==38.0.4
 cycler==0.11.0
 decorator==5.1.1
 entrypoints==0.4
@@ -19,13 +25,16 @@ GitPython==3.1.29
 huggingface-hub==0.10.1
 idna==3.4
 importlib-metadata==5.0.0
+isodate==0.6.1
 Jinja2==3.1.2
 joblib==1.2.0
 jsonschema==4.17.0
 kiwisolver==1.4.4
 MarkupSafe==2.1.1
 matplotlib==3.6.2
+msrest==0.7.1
 numpy==1.23.4
+oauthlib==3.2.2
 openai==0.25.0
 openpyxl==3.0.10
 packaging==21.3
@@ -35,6 +44,7 @@ Pillow==9.3.0
 plotly==5.11.0
 protobuf==3.20.3
 pyarrow==10.0.0
+pycparser==2.21
 pydeck==0.8.0
 Pygments==2.13.0
 Pympler==1.0.1
@@ -46,6 +56,7 @@ pytz-deprecation-shim==0.1.0.post0
 PyYAML==6.0
 regex==2022.10.31
 requests==2.28.1
+requests-oauthlib==1.3.1
 rich==12.6.0
 scikit-learn==1.1.3
 scipy==1.9.3

diff --git a/code/utilities/azureblobstorage.py b/code/utilities/azureblobstorage.py
@@ -0,0 +1,51 @@
+import os
+from datetime import datetime, timedelta
+from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, generate_blob_sas, generate_container_sas, ContentSettings
+
+def upload_file(bytes_data, file_name, content_type='application/pdf'):
+    account_name = os.environ['Blob_Account_Name']
+    account_key = os.environ['Blob_Account_Key']
+    connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
+    container_name = os.environ['Blob_Container_Name']
+    # Create the BlobServiceClient object
+    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
+    # Create a blob client using the local file name as the name for the blob
+    blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)
+    # Upload the created file
+    blob_client.upload_blob(bytes_data,overwrite=True, content_settings=ContentSettings(content_type=content_type))
+
+    return blob_client.url + '?' + generate_blob_sas(account_name, container_name, file_name,account_key=account_key,  permission="r", expiry=datetime.utcnow() + timedelta(hours=3))
+
+def get_all_files():
+    # Get all files in the container from Azure Blob Storage
+    account_name = os.environ['Blob_Account_Name']
+    account_key = os.environ['Blob_Account_Key']
+    connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
+    container_name = os.environ['Blob_Container_Name']
+    # Create the BlobServiceClient object
+    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
+    # Get files in the container
+    container_client = blob_service_client.get_container_client(container_name)
+    blob_list = container_client.list_blobs()
+    # sas = generate_blob_sas(account_name, container_name, blob.name,account_key=account_key,  permission="r", expiry=datetime.utcnow() + timedelta(hours=3))
+    sas = generate_container_sas(account_name, container_name,account_key=account_key,  permission="r", expiry=datetime.utcnow() + timedelta(hours=3))
+    files = []
+    converted_files = {}
+    for blob in blob_list:
+        if not blob.name.startswith('converted/'):
+            files.append({
+                "filename" : blob.name,
+                "converted": False, 
+                "fullpath": f"https://{account_name}.blob.core.windows.net/{container_name}/{blob.name}?{sas}",
+                "converted_path": ""
+                })
+        else:
+            converted_files[blob.name] = f"https://{account_name}.blob.core.windows.net/{container_name}/{blob.name}?{sas}"
+
+    for file in files:
+        converted_filename = f"converted/{file['filename']}.txt"
+        if converted_filename in converted_files:
+            file['converted'] = True
+            file['converted_path'] = converted_files[converted_filename]
+
+    return files
diff --git a/code/utilities/formrecognizer.py b/code/utilities/formrecognizer.py
@@ -0,0 +1,16 @@
+from azure.core.credentials import AzureKeyCredential
+from azure.ai.formrecognizer import DocumentAnalysisClient
+import os
+
+def analyze_read(formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/rest-api/read.png"):
+
+    document_analysis_client = DocumentAnalysisClient(
+        endpoint=os.environ['Form_Recognizer_Endpoint'], credential=AzureKeyCredential(os.environ['Form_Recognizer_Key'])
+    )
+
+    poller = document_analysis_client.begin_analyze_document_from_url(
+            "prebuilt-read", formUrl)
+    result = poller.result()
+
+    return result.content
+