Bug fixes including pubmed-id issue, too long cols, and some flaky tests (totallylegitco#139)

* Misc bug fixes including pubmed-id issue, too long procedures / diagnosis, and try to make tests slightly less brittle.
* Remove some hard coded sleeps from the tests
* Improve / fix PubMed fax inclusion logic.
* Note that it's best effort.
* Move some of the selenium stuff into a base.
* Move pubmed logic out
* Update fax logic to better include pubmed summaries
* Fix multi-file-select with client side OCR
* Drop exception from testing
* Fix type check around do_article_summary being optional
* Style
* Fall through to lualatex in the other place too. Maybe move to utils but we only do it twice so far
andrewoswald · Nov 30, 2024 · 0226aed · 0226aed
1 parent 1d32eb7
commit 0226aed
Show file tree

Hide file tree

Showing 18 changed files with 393 additions and 196 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -8,8 +8,8 @@ FROM base-${TARGETARCH}
 
 ARG DEBIAN_FRONTEND=noninteractive
 
-# install nginx
-RUN apt-get update && apt-get upgrade -y && apt-get install nginx vim emacs libmariadb-dev-compat default-libmysqlclient-dev libssl-dev nodejs npm python3-opencv libgl1 tesseract-ocr nano nfs-common sudo iputils-ping hylafax-client pandoc texlive -y
+# install all of the tools we need.
+RUN apt-get update && apt-get upgrade -y && apt-get install nginx vim emacs libmariadb-dev-compat default-libmysqlclient-dev libssl-dev nodejs npm python3-opencv libgl1 tesseract-ocr nano nfs-common sudo iputils-ping hylafax-client pandoc texlive texlive-luatex -y
 COPY /conf/nginx.default /etc/nginx/sites-available/default
 RUN ln -sf /dev/stdout /var/log/nginx/access.log \
     && ln -sf /dev/stderr /var/log/nginx/error.log

diff --git a/fighthealthinsurance/common_view_logic.py b/fighthealthinsurance/common_view_logic.py
@@ -9,6 +9,7 @@
 from django.forms import Form
 from django.http import StreamingHttpResponse
 from django.template.loader import render_to_string
+from django.db.models import QuerySet
 
 import uszipcode
 from fighthealthinsurance.core_forms import *
@@ -20,6 +21,7 @@
 from fighthealthinsurance.question_forms import *
 from fighthealthinsurance.utils import pubmed_fetcher
 import ray
+from .pubmed_tools import PubMedTools
 
 appealGenerator = AppealGenerator()
 
@@ -130,6 +132,7 @@ def stage_appeal_fax(
         name: str,
         insurance_company: str,
         pubmed_articles_to_include: str = "",
+        pubmed_ids_parsed: Optional[List[str]] = None,
     ):
         hashed_email = Denial.get_hashed_email(email)
         # Get the current info
@@ -144,6 +147,7 @@ def stage_appeal_fax(
             "receiver_fax_number": fax_phone,
             "company_name": "Fight Health Insurance -- A service of Totally Legit Co.",
             "company_fax_number": "415-840-7591",
+            "company_phone_number": "202-938-3266",
             "fax_sent_datetime": str(datetime.datetime.now()),
         }
         html_content = render_to_string(
@@ -176,59 +180,24 @@ def stage_appeal_fax(
                 files_for_fax.append(f.name)
                 f.flush()
 
-        pubmed_ids_parsed = pubmed_articles_to_include.split(",")
-        pubmed_docs: list[PubMedArticleSummarized] = []
+        if pubmed_ids_parsed is None:
+            pubmed_ids_parsed = pubmed_articles_to_include.split(",")
+        pmt = PubMedTools()
+        pubmed_docs: list[PubMedArticleSummarized] = pmt.get_articles(pubmed_ids_parsed)
         # Try and include the pubmed ids that we have but also fetch if not present
-        for pmid in pubmed_ids_parsed:
-            if pmid is None or pmid == "":
-                continue
-            try:
-                pubmed_docs.append(PubMedArticleSummarized.objects.get(pmid == pmid))
-            except:
-                try:
-                    fetched = pubmed_fetcher.article_by_pmid(pmid)
-                    article = PubMedArticleSummarized.objects.create(
-                        pmid=pmid,
-                        doi=fetched.doi,
-                        title=fetched.title,
-                        abstract=fetched.abstract,
-                        text=fetched.content.text,
-                    )
-                    pubmed_docs.append(article)
-                except:
-                    print(f"Skipping {pmid}")
-
-        for pubmed_doc in pubmed_docs:
-            with tempfile.NamedTemporaryFile(
-                suffix=".txt", prefix="pubmeddoc", mode="w+t", delete=False
-            ) as f:
-                if pubmed_doc.title is not None:
-                    f.write(pubmed_doc.title + "\n")
-                if pubmed_doc.abstract is not None:
-                    f.write("Abstract:\n")
-                    f.write(pubmed_doc.abstract)
-                if pubmed_doc.text is not None:
-                    f.write("Text:\n")
-                    f.write(pubmed_doc.text)
-                files_for_fax.append(f.name)
-                f.flush()
-
+        pubmed_docs_paths = [
+            x for x in map(pmt.article_as_pdf, pubmed_docs) if x is not None
+        ]
+        files_for_fax.extend(pubmed_docs_paths)
         doc_path = flexible_fax_magic.assemble_single_output(
             input_paths=files_for_fax, extra="", user_header=str(uuid.uuid4())
         )
         doc_fname = os.path.basename(doc_path)
         doc = open(doc_path, "rb")
-        pmids = ""
-        try:
-            pmids = (
-                PubMedQueryData.objects.filter(denial_id=denial_id).get().articles or ""
-            )
-        except:
-            pass
         fts = FaxesToSend.objects.create(
             hashed_email=hashed_email,
             paid=False,
-            pmids=pmids,
+            pmids=json.dumps(pubmed_ids_parsed),
             appeal_text=completed_appeal_text,
             health_history=denial.health_history,
             email=email,
@@ -284,7 +253,9 @@ class ChooseAppealHelper:
     @classmethod
     def choose_appeal(
         cls, denial_id: str, appeal_text: str, email: str, semi_sekret: str
-    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+    ) -> Tuple[
+        Optional[str], Optional[str], Optional[QuerySet[PubMedArticleSummarized]]
+    ]:
         hashed_email = Denial.get_hashed_email(email)
         # Get the current info
         denial = Denial.objects.filter(
@@ -296,10 +267,14 @@ def choose_appeal(
         pa.save()
         articles = None
         try:
-            pmqd = PubMedQueryData.objects.filter(denial_id=denial_id).get()
+            pmqd = PubMedQueryData.objects.filter(denial_id=denial_id)[0]
             if pmqd.articles is not None:
-                articles = ",".join(pmqd.articles.split(",")[0:2])
-        except:
+                article_ids = json.loads(pmqd.articles)
+                articles = PubMedArticleSummarized.objects.filter(
+                    pmid__in=article_ids
+                ).distinct()
+        except Exception as e:
+            print(f"Error loading pubmed data {e}")
             pass
         return (denial.appeal_fax_number, denial.insurance_company, articles)
 
@@ -405,8 +380,10 @@ def find_next_steps(
         ).get()
         denial.denial_date = denial_date
 
-        denial.procedure = procedure
-        denial.diagnosis = diagnosis
+        if procedure is not None and len(procedure) < 200:
+            denial.procedure = procedure
+        if diagnosis is not None and len(diagnosis) < 200:
+            denial.diagnosis = diagnosis
         if plan_source is not None:
             denial.plan_source.set(plan_source)
         denial.save()
@@ -588,8 +565,10 @@ def create_denial(
         (procedure, diagnosis) = appealGenerator.get_procedure_and_diagnosis(
             denial_text=denial_text, use_external=denial.use_external
         )
-        denial.procedure = procedure
-        denial.diagnosis = diagnosis
+        if procedure is not None and len(procedure) < 200:
+            denial.procedure = procedure
+        if diagnosis is not None and len(diagnosis) < 200:
+            denial.diagnosis = diagnosis
         denial.save()
         r = re.compile(r"Group Name:\s*(.*?)(,|)\s*(INC|CO|LTD|LLC)\s+", re.IGNORECASE)
         g = r.search(denial_text)

diff --git a/fighthealthinsurance/core_forms.py b/fighthealthinsurance/core_forms.py
@@ -83,7 +83,6 @@ class FaxForm(DenialRefForm):
         widget=forms.Textarea(attrs={"class": "appeal_text"}), required=True
     )
     include_provided_health_history = forms.BooleanField(required=False)
-    pubmed_articles_to_include = forms.CharField(required=False)
 
 
 class FaxResendForm(forms.Form):

diff --git a/fighthealthinsurance/fax_utils.py b/fighthealthinsurance/fax_utils.py
@@ -460,7 +460,10 @@ def assemble_single_output(
             command = []
             # Don't double convert pdfs
             if input_path.endswith(".pdf"):
-                merger.append(input_path)
+                try:
+                    merger.append(input_path)
+                except Exception as e:
+                    print(f"Skipping {input_path} due to {e}")
             else:
                 command = ["pandoc", "--wrap=auto", input_path, f"-o{input_path}.pdf"]
                 result = subprocess.run(command)
@@ -484,7 +487,12 @@ def assemble_single_output(
                 if result.returncode == 0:
                     merger.append(f"{input_path}.pdf")
                 else:
-                    print(f"Skipping {input_path} from {result} with {command}")
+                    command.extend(["--pdf-engine=lualatex"])
+                    result = subprocess.run(command)
+                    if result.returncode == 0:
+                        merger.append(f"{input_path}.pdf")
+                    else:
+                        print(f"Skipping {input_path} from {result} with {command}")
         with tempfile.NamedTemporaryFile(
             suffix=".pdf", prefix="alltogether", mode="w+t", delete=False
         ) as t:

diff --git a/fighthealthinsurance/fax_views.py b/fighthealthinsurance/fax_views.py
@@ -56,7 +56,17 @@ class StageFaxView(generic.FormView):
     form_class = FaxForm
 
     def form_valid(self, form):
-        staged = SendFaxHelper.stage_appeal_fax(**form.cleaned_data)
+        form_data = form.cleaned_data
+        # Get all of the articles the user wants to send
+        print(f"Items {list(self.request.POST.items())}")
+        pubmed_checkboxes = [
+            key[len("pubmed_") :]
+            for key, value in self.request.POST.items()
+            if key.startswith("pubmed_") and value == "on"
+        ]
+        form_data["pubmed_ids_parsed"] = pubmed_checkboxes
+        print(f"Staging fax with {form_data}")
+        staged = SendFaxHelper.stage_appeal_fax(**form_data)
         stripe.api_key = settings.STRIPE_API_SECRET_KEY
         stripe.publishable_key = settings.STRIPE_API_PUBLISHABLE_KEY
         product = stripe.Product.create(name="Fax")

diff --git a/fighthealthinsurance/generate_appeal.py b/fighthealthinsurance/generate_appeal.py
@@ -1,27 +1,19 @@
 import itertools
 import json
 import random
-import tempfile
 import time
 import traceback
 from concurrent.futures import Future
 from typing import Any, Iterator, List, Optional, Tuple
 
-import PyPDF2
-import requests
 from fighthealthinsurance.denial_base import DenialBase
 from fighthealthinsurance.exec import *
 from fighthealthinsurance.ml_models import RemoteFullOpenLike, RemoteModelLike
 from fighthealthinsurance.model_router import model_router
-from fighthealthinsurance.models import (
-    PubMedArticleSummarized,
-    PubMedQueryData,
-)
 from fighthealthinsurance.process_denial import *
 from fighthealthinsurance.utils import as_available_nested, pubmed_fetcher
-from metapub import FindIt
-from stopit import ThreadingTimeout as Timeout
 from typing_extensions import reveal_type
+from .pubmed_tools import PubMedTools
 
 
 class AppealTemplateGenerator(object):
@@ -48,6 +40,7 @@ def generate(self, medical_reason):
 class AppealGenerator(object):
     def __init__(self):
         self.regex_denial_processor = ProcessDenialRegex()
+        self.pmt = PubMedTools()
 
     def get_fax_number(self, denial_text=None, use_external=False) -> Optional[str]:
         models_to_try = model_router.entity_extract_backends(use_external)
@@ -73,7 +66,12 @@ def get_procedure_and_diagnosis(
             if procedure_diagnosis is not None:
                 if len(procedure_diagnosis) > 1:
                     procedure = procedure or procedure_diagnosis[0]
+                    # If it's too long then we're probably not valid
+                    if procedure is not None and len(procedure) > 200:
+                        procedure = None
                     diagnosis = diagnosis or procedure_diagnosis[1]
+                    if diagnosis is not None and len(diagnosis) > 200:
+                        diagnosis = None
                 else:
                     print(
                         f"Unexpected procedure diagnosis len on {procedure_diagnosis}"
@@ -130,99 +128,6 @@ def make_open_med_prompt(
         else:
             return None
 
-    def find_more_context(self, denial) -> str:
-        """
-        Kind of hacky RAG routine that uses PubMed.
-        """
-        # PubMed
-        pmids = None
-        pmid_text: list[str] = []
-        article_futures: list[Future[PubMedArticleSummarized]] = []
-        with Timeout(15.0) as timeout_ctx:
-            query = f"{denial.procedure} {denial.diagnosis}"
-            pmids = pubmed_fetcher.pmids_for_query(query)
-            articles_json = json.dumps(pmids)
-            PubMedQueryData.objects.create(
-                query=query,
-                articles=articles_json,
-                denial_id=denial.denial_id,
-            ).save()
-            for article_id in pmids[0:3]:
-                article_futures.append(
-                    pubmed_executor.submit(self.do_article_summary, article_id, query)
-                )
-
-        def article_to_summary(article) -> str:
-            return f"PubMed DOI {article.doi} title {article.title} summary {article.basic_summary}"
-
-        articles: list[PubMedArticleSummarized] = []
-        # Get the articles that we've summarized
-        t = 10
-        for f in article_futures:
-            try:
-                articles.append(f.result(timeout=t))
-                t = t - 1
-            except Exception as e:
-                print(f"Skipping appending article from {f} due to {e} of {type(e)}")
-
-        if len(articles) > 0:
-            return "\n".join(map(article_to_summary, articles))
-        else:
-            return ""
-
-    def do_article_summary(self, article_id, query) -> PubMedArticleSummarized:
-        possible_articles = PubMedArticleSummarized.objects.filter(
-            pmid=article_id,
-            query=query,
-            basic_summary__isnull=False,
-        )[:1]
-        article = None
-        if len(possible_articles) > 0:
-            article = possible_articles[0]
-
-        if article is None:
-            fetched = pubmed_fetcher.article_by_pmid(article_id)
-            src = FindIt(article_id)
-            url = src.url
-            article_text = ""
-            if url is not None:
-                response = requests.get(url)
-                if (
-                    ".pdf" in url
-                    or response.headers.get("Content-Type") == "application/pdf"
-                ):
-                    with tempfile.NamedTemporaryFile(
-                        suffix=".pdf", delete=False
-                    ) as my_data:
-                        my_data.write(response.content)
-
-                        open_pdf_file = open(my_data.name, "rb")
-                        read_pdf = PyPDF2.PdfReader(open_pdf_file)
-                        if read_pdf.is_encrypted:
-                            read_pdf.decrypt("")
-                            for page in read_pdf.pages:
-                                article_text += page.extract_text()
-                        else:
-                            for page in read_pdf.pages:
-                                article_text += page.extract_text()
-                else:
-                    article_text += response.text
-            else:
-                article_text = fetched.content.text
-
-            article = PubMedArticleSummarized.objects.create(
-                pmid=article_id,
-                doi=fetched.doi,
-                title=fetched.title,
-                abstract=fetched.abstract,
-                text=article_text,
-                query=query,
-                basic_summary=model_router.summarize(
-                    query=query, abstract=fetched.abstract, text=article_text
-                ),
-            )
-        return article
-
     def make_appeals(
         self, denial, template_generator, medical_reasons=[], non_ai_appeals=[]
     ) -> Iterator[str]:
@@ -239,7 +144,7 @@ def make_appeals(
 
         pubmed_context = None
         try:
-            pubmed_context = self.find_more_context(denial)
+            pubmed_context = self.pmt.find_context_for_denial(denial)
         except Exception as e:
             print(f"Error {e} looking up context for {denial}.")