Skip to content

Commit

Permalink
text extraction from doc
Browse files Browse the repository at this point in the history
  • Loading branch information
ruoccofabrizio committed Dec 22, 2022
1 parent 49efa86 commit 3980cb7
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 0 deletions.
80 changes: 80 additions & 0 deletions code/pages/Doc-Extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import html
import json
import os
import re
from os import path
from urllib.error import URLError

import requests
import streamlit as st

from utilities.azureblobstorage import upload_file, get_all_files
from utilities.formrecognizer import analyze_read

def convert_file(fullpath, filename):
    """Extract the text of a stored document and save it next to the original.

    Args:
        fullpath: URL of the source document, passed unchanged to
            ``analyze_read``.
        filename: Name of the source document; the extracted text is uploaded
            as ``converted/<filename>.txt``.
    """
    # Extract the text from the file
    text = analyze_read(fullpath)
    # Upload the text to Azure Blob Storage. 'text/plain' is the registered
    # media type for plain text ('application/text' is not), so browsers can
    # display the converted file inline instead of offering a download.
    upload_file(text, f'converted/{filename}.txt', content_type='text/plain; charset=utf-8')

########## START - MAIN ##########
try:
    # Set page layout to wide screen and menu item
    menu_items = {
        'Get help': None,
        'Report a bug': None,
        'About': '''
## Embeddings App
Document Reader Sample Demo.
'''
    }
    st.set_page_config(layout="wide", menu_items=menu_items)

    # Hide the default Streamlit menu and footer.
    hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)

    uploaded_file = st.file_uploader("Upload a document", type=['pdf'])
    # Only upload and analyze when the selected file differs from the one
    # remembered in the session, so a rerun does not repeat the work.
    if uploaded_file is not None and st.session_state.get('filename', '') != uploaded_file.name:
        # Upload a new file
        st.session_state['filename'] = uploaded_file.name
        st.session_state['file_url'] = upload_file(uploaded_file.getvalue(), st.session_state['filename'])
        # Get OCR with Read API
        st.session_state['text'] = analyze_read(st.session_state['file_url'])

    files_data = get_all_files()

    # Header row of the file table.
    cols = st.columns([2, 2, 1])
    cols[0].write('Original File')
    cols[1].write('Converted File')
    cols[2].write('Convert')

    # One row per stored file: link to the original, link to the converted
    # text when it exists, and a Convert button otherwise.
    for x in files_data:
        col1, col2, col3 = st.columns([2, 2, 1])
        # File names and URLs are interpolated into raw HTML, so escape them
        # to keep a crafted file name from injecting markup into the page.
        safe_name = html.escape(x['filename'])
        col1.write(f'<a href="{html.escape(x["fullpath"])}">{safe_name}</a>', unsafe_allow_html=True)
        if x['converted_path'] != '':
            col2.write(f'<a href="{html.escape(x["converted_path"])}">{safe_name}.txt</a>', unsafe_allow_html=True)
        if not x['converted']:
            col3.button(
                'Convert',
                key=x['filename'] + '_button',
                on_click=convert_file,
                args=(x['fullpath'], x['filename']),
            )

except URLError as e:
    # Only URLError is handled here; any other exception propagates to Streamlit.
    st.error(
        """
**This demo requires internet access.**
Connection error: %s
"""
        % e.reason
    )

########## END - MAIN ##########
11 changes: 11 additions & 0 deletions code/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
altair==4.2.0
attrs==22.1.0
azure-ai-formrecognizer==3.2.0
azure-common==1.1.28
azure-core==1.26.1
azure-storage-blob==12.14.1
blinker==1.5
cachetools==5.2.0
certifi==2022.9.24
cffi==1.15.1
charset-normalizer==2.1.1
click==8.1.3
colorama==0.4.6
commonmark==0.9.1
contourpy==1.0.6
cryptography==38.0.4
cycler==0.11.0
decorator==5.1.1
entrypoints==0.4
Expand All @@ -19,13 +25,16 @@ GitPython==3.1.29
huggingface-hub==0.10.1
idna==3.4
importlib-metadata==5.0.0
isodate==0.6.1
Jinja2==3.1.2
joblib==1.2.0
jsonschema==4.17.0
kiwisolver==1.4.4
MarkupSafe==2.1.1
matplotlib==3.6.2
msrest==0.7.1
numpy==1.23.4
oauthlib==3.2.2
openai==0.25.0
openpyxl==3.0.10
packaging==21.3
Expand All @@ -35,6 +44,7 @@ Pillow==9.3.0
plotly==5.11.0
protobuf==3.20.3
pyarrow==10.0.0
pycparser==2.21
pydeck==0.8.0
Pygments==2.13.0
Pympler==1.0.1
Expand All @@ -46,6 +56,7 @@ pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0
regex==2022.10.31
requests==2.28.1
requests-oauthlib==1.3.1
rich==12.6.0
scikit-learn==1.1.3
scipy==1.9.3
Expand Down
51 changes: 51 additions & 0 deletions code/utilities/azureblobstorage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os
from datetime import datetime, timedelta
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, generate_blob_sas, generate_container_sas, ContentSettings

def upload_file(bytes_data, file_name, content_type='application/pdf'):
    """Upload data as a blob and return a time-limited read URL for it.

    The storage account, key and container are read from the
    ``Blob_Account_Name``, ``Blob_Account_Key`` and ``Blob_Container_Name``
    environment variables. An existing blob with the same name is overwritten.

    Args:
        bytes_data: Content to upload.
        file_name: Name of the blob inside the container.
        content_type: Content type stored with the blob.

    Returns:
        The blob URL with a read-only SAS token appended, expiring 3 hours
        after the call.
    """
    account = os.environ['Blob_Account_Name']
    key = os.environ['Blob_Account_Key']
    connection_string = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={account};"
        f"AccountKey={key};"
        "EndpointSuffix=core.windows.net"
    )
    container = os.environ['Blob_Container_Name']

    # Resolve a client bound to the target blob.
    service = BlobServiceClient.from_connection_string(connection_string)
    target = service.get_blob_client(container=container, blob=file_name)

    # Store the content, replacing any previous version of the blob.
    settings = ContentSettings(content_type=content_type)
    target.upload_blob(bytes_data, overwrite=True, content_settings=settings)

    # Sign a read-only token so the returned URL can be opened without the account key.
    expires_at = datetime.utcnow() + timedelta(hours=3)
    read_token = generate_blob_sas(
        account,
        container,
        file_name,
        account_key=key,
        permission="r",
        expiry=expires_at,
    )
    return f"{target.url}?{read_token}"

def get_all_files():
    """List the source documents in the container with their converted text, if any.

    The storage account, key and container are read from the
    ``Blob_Account_Name``, ``Blob_Account_Key`` and ``Blob_Container_Name``
    environment variables. Blobs whose name starts with ``converted/`` are
    treated as conversion outputs; every other blob is a source document.

    Returns:
        A list with one dict per source document:
            ``filename``: the blob name.
            ``fullpath``: URL of the blob with a read-only SAS token.
            ``converted``: True when ``converted/<filename>.txt`` exists.
            ``converted_path``: URL of that converted blob, or ``""``.
        All URLs share one container-level SAS token that expires 3 hours
        after the call.
    """
    # Function-scope import: only this function needs it.
    from urllib.parse import quote

    # Get all files in the container from Azure Blob Storage
    account_name = os.environ['Blob_Account_Name']
    account_key = os.environ['Blob_Account_Key']
    connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
    container_name = os.environ['Blob_Container_Name']
    # Create the BlobServiceClient object
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    # Get files in the container
    container_client = blob_service_client.get_container_client(container_name)
    blob_list = container_client.list_blobs()
    # One container-level token is reused for every link instead of signing each blob.
    sas = generate_container_sas(account_name, container_name, account_key=account_key, permission="r", expiry=datetime.utcnow() + timedelta(hours=3))
    base_url = f"https://{account_name}.blob.core.windows.net/{container_name}"

    files = []
    converted_files = {}
    for blob in blob_list:
        # Percent-encode the blob name so names containing spaces, '#', '?'
        # or '%' still yield a valid URL; '/' is kept as the path separator.
        encoded_name = quote(blob.name, safe='~/')
        blob_url = f"{base_url}/{encoded_name}?{sas}"
        if blob.name.startswith('converted/'):
            converted_files[blob.name] = blob_url
        else:
            files.append({
                "filename": blob.name,
                "converted": False,
                "fullpath": blob_url,
                "converted_path": "",
            })

    # Pair each source document with its converted counterpart, when present.
    for file in files:
        converted_path = converted_files.get(f"converted/{file['filename']}.txt")
        if converted_path is not None:
            file['converted'] = True
            file['converted_path'] = converted_path

    return files
16 changes: 16 additions & 0 deletions code/utilities/formrecognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import os

def analyze_read(formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/rest-api/read.png"):
    """Run the "prebuilt-read" model on a document URL and return its text content.

    The service endpoint and key are read from the ``Form_Recognizer_Endpoint``
    and ``Form_Recognizer_Key`` environment variables.

    Args:
        formUrl: URL of the document to analyze; defaults to a sample image.

    Returns:
        The ``content`` attribute of the analysis result.
    """
    endpoint = os.environ['Form_Recognizer_Endpoint']
    credential = AzureKeyCredential(os.environ['Form_Recognizer_Key'])
    client = DocumentAnalysisClient(endpoint=endpoint, credential=credential)

    # Start the analysis and block until the service has finished it.
    analysis = client.begin_analyze_document_from_url("prebuilt-read", formUrl).result()
    return analysis.content

0 comments on commit 3980cb7

Please sign in to comment.