Skip to content

Commit

Permalink
Read PDFs using Wiley API
Browse files: browse the repository at this point in the history
  • Loading branch information
breuleux committed Dec 6, 2024
1 parent 955afe8 commit c844625
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 12 deletions.
1 change: 1 addition & 0 deletions src/paperoni/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class PaperoniTokens:
elsevier: str = None
springer: str = None
zeta_alpha: str = None
wiley: str = None


@dataclass
Expand Down
18 changes: 8 additions & 10 deletions src/paperoni/fulltext/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from ..config import papconf
from ..utils import download
from .locate import find_download_links, ua
from .locate import URL, find_download_links, ua
from .pdfanal import to_plain


Expand Down Expand Up @@ -41,10 +41,9 @@ class ErrorData:


@dataclass
class URLResult:
class DownloadResult:
ref: str
url: str
info: str
url: URL
downloaded: bool = False
error: Optional[ErrorData] = None

Expand All @@ -54,7 +53,7 @@ class Metadata:
identifier: str
title: str
success: bool
sources: list[URLResult]
sources: list[DownloadResult]


class PDF:
Expand Down Expand Up @@ -103,9 +102,9 @@ def fetch_link(self, src):
pdf = self.pdf_path
try:
download(
url=src.url,
url=src.url.url,
filename=self.pdf_path,
headers={"User-Agent": ua.random},
headers={"User-Agent": ua.random, **src.url.headers},
)
src.downloaded = True
except Exception as exc:
Expand Down Expand Up @@ -147,9 +146,8 @@ def fetch(self):
self.initialize_meta()
for ref in self.refs:
for url in find_download_links(ref):
src = URLResult(
url=url.url,
info=url.info,
src = DownloadResult(
url=url,
ref=ref,
downloaded=False,
error=None,
Expand Down
14 changes: 12 additions & 2 deletions src/paperoni/fulltext/locate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import json
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import Literal

import requests
Expand All @@ -16,6 +15,17 @@
class URL:
url: str
info: str
headers: dict[str, str] = field(default_factory=dict)

def readable(self):
hd = requests.head(
self.url, headers={"User-Agent": ua.random, **self.headers}
)
try:
hd.raise_for_status()
except Exception:
return False
return True


@ovld
Expand Down

0 comments on commit c844625

Please sign in to comment.