Skip to content

Commit

Permalink
Bug fixes including pubmed-id issue, too long cols, and some flaky te…
Browse files Browse the repository at this point in the history
…sts (totallylegitco#139)

* Misc bug fixes including pubmed-id issue, too long procedures / diagnosis, and try to make tests slightly less brittle.

* Remove some hard coded sleeps from the tests

* Improve / fix PubMed fax inclusion logic.

* Note that it's best effort.

* Move some of the selenium stuff into a base.

* Move pubmed logic out

* Update fax logic to better include pubmed summaries

* Fix multi-file-select with client side OCR

* Drop exception from testing

* Fix type check around do_article_summary being optional

* Style

* Fall through to lualatex in the other place too. Maybe move to utils, but we only do it twice so far.
  • Loading branch information
holdenk authored Nov 30, 2024
1 parent 1d32eb7 commit 0226aed
Show file tree
Hide file tree
Showing 18 changed files with 393 additions and 196 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ FROM base-${TARGETARCH}

ARG DEBIAN_FRONTEND=noninteractive

# install nginx
RUN apt-get update && apt-get upgrade -y && apt-get install nginx vim emacs libmariadb-dev-compat default-libmysqlclient-dev libssl-dev nodejs npm python3-opencv libgl1 tesseract-ocr nano nfs-common sudo iputils-ping hylafax-client pandoc texlive -y
# install all of the tools we need.
RUN apt-get update && apt-get upgrade -y && apt-get install nginx vim emacs libmariadb-dev-compat default-libmysqlclient-dev libssl-dev nodejs npm python3-opencv libgl1 tesseract-ocr nano nfs-common sudo iputils-ping hylafax-client pandoc texlive texlive-luatex -y
COPY /conf/nginx.default /etc/nginx/sites-available/default
RUN ln -sf /dev/stdout /var/log/nginx/access.log \
&& ln -sf /dev/stderr /var/log/nginx/error.log
Expand Down
83 changes: 31 additions & 52 deletions fighthealthinsurance/common_view_logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from django.forms import Form
from django.http import StreamingHttpResponse
from django.template.loader import render_to_string
from django.db.models import QuerySet

import uszipcode
from fighthealthinsurance.core_forms import *
Expand All @@ -20,6 +21,7 @@
from fighthealthinsurance.question_forms import *
from fighthealthinsurance.utils import pubmed_fetcher
import ray
from .pubmed_tools import PubMedTools

appealGenerator = AppealGenerator()

Expand Down Expand Up @@ -130,6 +132,7 @@ def stage_appeal_fax(
name: str,
insurance_company: str,
pubmed_articles_to_include: str = "",
pubmed_ids_parsed: Optional[List[str]] = None,
):
hashed_email = Denial.get_hashed_email(email)
# Get the current info
Expand All @@ -144,6 +147,7 @@ def stage_appeal_fax(
"receiver_fax_number": fax_phone,
"company_name": "Fight Health Insurance -- A service of Totally Legit Co.",
"company_fax_number": "415-840-7591",
"company_phone_number": "202-938-3266",
"fax_sent_datetime": str(datetime.datetime.now()),
}
html_content = render_to_string(
Expand Down Expand Up @@ -176,59 +180,24 @@ def stage_appeal_fax(
files_for_fax.append(f.name)
f.flush()

pubmed_ids_parsed = pubmed_articles_to_include.split(",")
pubmed_docs: list[PubMedArticleSummarized] = []
if pubmed_ids_parsed is None:
pubmed_ids_parsed = pubmed_articles_to_include.split(",")
pmt = PubMedTools()
pubmed_docs: list[PubMedArticleSummarized] = pmt.get_articles(pubmed_ids_parsed)
# Try and include the pubmed ids that we have but also fetch if not present
for pmid in pubmed_ids_parsed:
if pmid is None or pmid == "":
continue
try:
pubmed_docs.append(PubMedArticleSummarized.objects.get(pmid == pmid))
except:
try:
fetched = pubmed_fetcher.article_by_pmid(pmid)
article = PubMedArticleSummarized.objects.create(
pmid=pmid,
doi=fetched.doi,
title=fetched.title,
abstract=fetched.abstract,
text=fetched.content.text,
)
pubmed_docs.append(article)
except:
print(f"Skipping {pmid}")

for pubmed_doc in pubmed_docs:
with tempfile.NamedTemporaryFile(
suffix=".txt", prefix="pubmeddoc", mode="w+t", delete=False
) as f:
if pubmed_doc.title is not None:
f.write(pubmed_doc.title + "\n")
if pubmed_doc.abstract is not None:
f.write("Abstract:\n")
f.write(pubmed_doc.abstract)
if pubmed_doc.text is not None:
f.write("Text:\n")
f.write(pubmed_doc.text)
files_for_fax.append(f.name)
f.flush()

pubmed_docs_paths = [
x for x in map(pmt.article_as_pdf, pubmed_docs) if x is not None
]
files_for_fax.extend(pubmed_docs_paths)
doc_path = flexible_fax_magic.assemble_single_output(
input_paths=files_for_fax, extra="", user_header=str(uuid.uuid4())
)
doc_fname = os.path.basename(doc_path)
doc = open(doc_path, "rb")
pmids = ""
try:
pmids = (
PubMedQueryData.objects.filter(denial_id=denial_id).get().articles or ""
)
except:
pass
fts = FaxesToSend.objects.create(
hashed_email=hashed_email,
paid=False,
pmids=pmids,
pmids=json.dumps(pubmed_ids_parsed),
appeal_text=completed_appeal_text,
health_history=denial.health_history,
email=email,
Expand Down Expand Up @@ -284,7 +253,9 @@ class ChooseAppealHelper:
@classmethod
def choose_appeal(
cls, denial_id: str, appeal_text: str, email: str, semi_sekret: str
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
) -> Tuple[
Optional[str], Optional[str], Optional[QuerySet[PubMedArticleSummarized]]
]:
hashed_email = Denial.get_hashed_email(email)
# Get the current info
denial = Denial.objects.filter(
Expand All @@ -296,10 +267,14 @@ def choose_appeal(
pa.save()
articles = None
try:
pmqd = PubMedQueryData.objects.filter(denial_id=denial_id).get()
pmqd = PubMedQueryData.objects.filter(denial_id=denial_id)[0]
if pmqd.articles is not None:
articles = ",".join(pmqd.articles.split(",")[0:2])
except:
article_ids = json.loads(pmqd.articles)
articles = PubMedArticleSummarized.objects.filter(
pmid__in=article_ids
).distinct()
except Exception as e:
print(f"Error loading pubmed data {e}")
pass
return (denial.appeal_fax_number, denial.insurance_company, articles)

Expand Down Expand Up @@ -405,8 +380,10 @@ def find_next_steps(
).get()
denial.denial_date = denial_date

denial.procedure = procedure
denial.diagnosis = diagnosis
if procedure is not None and len(procedure) < 200:
denial.procedure = procedure
if diagnosis is not None and len(diagnosis) < 200:
denial.diagnosis = diagnosis
if plan_source is not None:
denial.plan_source.set(plan_source)
denial.save()
Expand Down Expand Up @@ -588,8 +565,10 @@ def create_denial(
(procedure, diagnosis) = appealGenerator.get_procedure_and_diagnosis(
denial_text=denial_text, use_external=denial.use_external
)
denial.procedure = procedure
denial.diagnosis = diagnosis
if procedure is not None and len(procedure) < 200:
denial.procedure = procedure
if diagnosis is not None and len(diagnosis) < 200:
denial.diagnosis = diagnosis
denial.save()
r = re.compile(r"Group Name:\s*(.*?)(,|)\s*(INC|CO|LTD|LLC)\s+", re.IGNORECASE)
g = r.search(denial_text)
Expand Down
1 change: 0 additions & 1 deletion fighthealthinsurance/core_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ class FaxForm(DenialRefForm):
widget=forms.Textarea(attrs={"class": "appeal_text"}), required=True
)
include_provided_health_history = forms.BooleanField(required=False)
pubmed_articles_to_include = forms.CharField(required=False)


class FaxResendForm(forms.Form):
Expand Down
12 changes: 10 additions & 2 deletions fighthealthinsurance/fax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,10 @@ def assemble_single_output(
command = []
# Don't double convert pdfs
if input_path.endswith(".pdf"):
merger.append(input_path)
try:
merger.append(input_path)
except Exception as e:
print(f"Skipping {input_path} due to {e}")
else:
command = ["pandoc", "--wrap=auto", input_path, f"-o{input_path}.pdf"]
result = subprocess.run(command)
Expand All @@ -484,7 +487,12 @@ def assemble_single_output(
if result.returncode == 0:
merger.append(f"{input_path}.pdf")
else:
print(f"Skipping {input_path} from {result} with {command}")
command.extend(["--pdf-engine=lualatex"])
result = subprocess.run(command)
if result.returncode == 0:
merger.append(f"{input_path}.pdf")
else:
print(f"Skipping {input_path} from {result} with {command}")
with tempfile.NamedTemporaryFile(
suffix=".pdf", prefix="alltogether", mode="w+t", delete=False
) as t:
Expand Down
12 changes: 11 additions & 1 deletion fighthealthinsurance/fax_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,17 @@ class StageFaxView(generic.FormView):
form_class = FaxForm

def form_valid(self, form):
staged = SendFaxHelper.stage_appeal_fax(**form.cleaned_data)
form_data = form.cleaned_data
# Get all of the articles the user wants to send
print(f"Items {list(self.request.POST.items())}")
pubmed_checkboxes = [
key[len("pubmed_") :]
for key, value in self.request.POST.items()
if key.startswith("pubmed_") and value == "on"
]
form_data["pubmed_ids_parsed"] = pubmed_checkboxes
print(f"Staging fax with {form_data}")
staged = SendFaxHelper.stage_appeal_fax(**form_data)
stripe.api_key = settings.STRIPE_API_SECRET_KEY
stripe.publishable_key = settings.STRIPE_API_PUBLISHABLE_KEY
product = stripe.Product.create(name="Fax")
Expand Down
111 changes: 8 additions & 103 deletions fighthealthinsurance/generate_appeal.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,19 @@
import itertools
import json
import random
import tempfile
import time
import traceback
from concurrent.futures import Future
from typing import Any, Iterator, List, Optional, Tuple

import PyPDF2
import requests
from fighthealthinsurance.denial_base import DenialBase
from fighthealthinsurance.exec import *
from fighthealthinsurance.ml_models import RemoteFullOpenLike, RemoteModelLike
from fighthealthinsurance.model_router import model_router
from fighthealthinsurance.models import (
PubMedArticleSummarized,
PubMedQueryData,
)
from fighthealthinsurance.process_denial import *
from fighthealthinsurance.utils import as_available_nested, pubmed_fetcher
from metapub import FindIt
from stopit import ThreadingTimeout as Timeout
from typing_extensions import reveal_type
from .pubmed_tools import PubMedTools


class AppealTemplateGenerator(object):
Expand All @@ -48,6 +40,7 @@ def generate(self, medical_reason):
class AppealGenerator(object):
def __init__(self):
self.regex_denial_processor = ProcessDenialRegex()
self.pmt = PubMedTools()

def get_fax_number(self, denial_text=None, use_external=False) -> Optional[str]:
models_to_try = model_router.entity_extract_backends(use_external)
Expand All @@ -73,7 +66,12 @@ def get_procedure_and_diagnosis(
if procedure_diagnosis is not None:
if len(procedure_diagnosis) > 1:
procedure = procedure or procedure_diagnosis[0]
# If it's too long then we're probably not valid
if procedure is not None and len(procedure) > 200:
procedure = None
diagnosis = diagnosis or procedure_diagnosis[1]
if diagnosis is not None and len(diagnosis) > 200:
diagnosis = None
else:
print(
f"Unexpected procedure diagnosis len on {procedure_diagnosis}"
Expand Down Expand Up @@ -130,99 +128,6 @@ def make_open_med_prompt(
else:
return None

def find_more_context(self, denial) -> str:
    """
    Kind of hacky RAG routine that uses PubMed.

    Builds a query from ``denial.procedure`` and ``denial.diagnosis``, asks
    ``pubmed_fetcher`` for matching PMIDs, records the query and the
    JSON-encoded PMID list in a ``PubMedQueryData`` row keyed by
    ``denial.denial_id``, and submits summarization of the first three PMIDs
    to ``pubmed_executor``.

    Returns one line per successfully summarized article (DOI, title and
    basic summary), joined with newlines, or "" when no article could be
    summarized.
    """
    article_futures: list[Future[PubMedArticleSummarized]] = []
    # Lookup, bookkeeping and task submission all run under a 15 second
    # Timeout block.
    # NOTE(review): presumably the block is abandoned quietly on timeout,
    # leaving whatever futures were already submitted -- confirm against the
    # stopit ThreadingTimeout defaults.
    with Timeout(15.0):
        query = f"{denial.procedure} {denial.diagnosis}"
        pmids = pubmed_fetcher.pmids_for_query(query)
        # objects.create() already persists the row, so no extra .save().
        PubMedQueryData.objects.create(
            query=query,
            articles=json.dumps(pmids),
            denial_id=denial.denial_id,
        )
        for article_id in pmids[0:3]:
            article_futures.append(
                pubmed_executor.submit(self.do_article_summary, article_id, query)
            )

    def article_to_summary(article) -> str:
        return f"PubMed DOI {article.doi} title {article.title} summary {article.basic_summary}"

    articles: list[PubMedArticleSummarized] = []
    # Collect the summaries that finished; the per-future wait shrinks by one
    # second after each success.
    t = 10
    for f in article_futures:
        try:
            articles.append(f.result(timeout=t))
            t = t - 1
        except Exception as e:
            print(f"Skipping appending article from {f} due to {e} of {type(e)}")

    # Joining an empty list yields "", so no separate empty-case branch is needed.
    return "\n".join(map(article_to_summary, articles))

def do_article_summary(self, article_id, query) -> PubMedArticleSummarized:
    """
    Return a summarized PubMed article for ``article_id`` and ``query``.

    If a ``PubMedArticleSummarized`` row with this pmid and query already has
    a ``basic_summary``, that row is returned. Otherwise the article is
    fetched through ``pubmed_fetcher``, its text is gathered, and a new row is
    created with a summary from ``model_router.summarize``.

    The text comes from one of three places:
      * no URL from ``FindIt``: ``fetched.content.text``;
      * URL that looks like a PDF (".pdf" in the URL or an
        "application/pdf" Content-Type): text extracted page by page with
        PyPDF2;
      * any other URL: the raw response text.
    """
    possible_articles = PubMedArticleSummarized.objects.filter(
        pmid=article_id,
        query=query,
        basic_summary__isnull=False,
    )[:1]
    if len(possible_articles) > 0:
        return possible_articles[0]

    fetched = pubmed_fetcher.article_by_pmid(article_id)
    url = FindIt(article_id).url
    article_text = ""
    if url is None:
        article_text = fetched.content.text
    else:
        response = requests.get(url)
        looks_like_pdf = (
            ".pdf" in url
            or response.headers.get("Content-Type") == "application/pdf"
        )
        if looks_like_pdf:
            with tempfile.NamedTemporaryFile(
                suffix=".pdf", delete=False
            ) as my_data:
                my_data.write(response.content)
            # Reopen only after the writer above has been closed so the
            # downloaded bytes are on disk, and use a context manager so the
            # read handle is always released.
            # NOTE(review): the temp file itself is left on disk
            # (delete=False) -- consider removing it once parsed.
            with open(my_data.name, "rb") as open_pdf_file:
                read_pdf = PyPDF2.PdfReader(open_pdf_file)
                if read_pdf.is_encrypted:
                    # Try the empty password before reading pages.
                    read_pdf.decrypt("")
                for page in read_pdf.pages:
                    article_text += page.extract_text()
        else:
            article_text += response.text

    return PubMedArticleSummarized.objects.create(
        pmid=article_id,
        doi=fetched.doi,
        title=fetched.title,
        abstract=fetched.abstract,
        text=article_text,
        query=query,
        basic_summary=model_router.summarize(
            query=query, abstract=fetched.abstract, text=article_text
        ),
    )

def make_appeals(
self, denial, template_generator, medical_reasons=[], non_ai_appeals=[]
) -> Iterator[str]:
Expand All @@ -239,7 +144,7 @@ def make_appeals(

pubmed_context = None
try:
pubmed_context = self.find_more_context(denial)
pubmed_context = self.pmt.find_context_for_denial(denial)
except Exception as e:
print(f"Error {e} looking up context for {denial}.")

Expand Down
Loading

0 comments on commit 0226aed

Please sign in to comment.