Merge branch 'TideDra:main' into main

hbzcnc · Feb 26, 2025 · ccb29ab · ccb29ab
2 parents c3b3eb5 + b2a7804
commit ccb29ab
Show file tree

Hide file tree

Showing 8 changed files with 28 additions and 13 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -36,5 +36,6 @@ jobs:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
           MODEL_NAME: ${{ secrets.MODEL_NAME }}
+          LANGUAGE: ${{ vars.LANGUAGE }}
         run: |
           uv run main.py
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -34,5 +34,6 @@ jobs:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
           MODEL_NAME: ${{ secrets.MODEL_NAME }}
+          LANGUAGE: ${{ vars.LANGUAGE }}
         run: |
           uv run main.py --debug
diff --git a/README.md b/README.md
@@ -12,12 +12,13 @@
   [![GitHub Issues](https://img.shields.io/github/issues/TideDra/zotero-arxiv-daily)](https://github.com/TideDra/zotero-arxiv-daily/issues)
   [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/TideDra/zotero-arxiv-daily)](https://github.com/TideDra/zotero-arxiv-daily/pulls)
   [![License](https://img.shields.io/github/license/TideDra/zotero-arxiv-daily)](/LICENSE)
+  [<img src="https://api.gitsponsors.com/api/badge/img?id=893025857" height="20">](https://api.gitsponsors.com/api/badge/link?p=PKMtRut1dWWuC1oFdJweyDSvJg454/GkdIx4IinvBblaX2AY4rQ7FYKAK1ZjApoiNhYEeduIEhfeZVIwoIVlvcwdJXVFD2nV2EE5j6lYXaT/RHrcsQbFl3aKe1F3hliP26OMayXOoZVDidl05wj+yg==)
 
 </div>
 
 ---
 
-<p align="center"> Recommend new arxiv papers of your interest daily according to your Zotero libarary.
+<p align="center"> Recommend new arxiv papers of your interest daily according to your Zotero library.
     <br> 
 </p>
 
@@ -28,7 +29,7 @@
 
 > Track new scientific research of your interest by just forking (and starring) this repo!😊
 
-*Zotero-arXiv-Daily* finds arxiv papers that may attract you based on the context of your Zotero libarary, and then sends the result to your mailbox📮. It can be deployed as Github Action Workflow with **zero cost**, **no installation**, and **few configuration** of Github Action environment variables for daily **automatic** delivery.
+*Zotero-arXiv-Daily* finds arXiv papers that may interest you based on the context of your Zotero library, and then sends the results to your mailbox📮. It can be deployed as a GitHub Actions workflow with **zero cost**, **no installation**, and **minimal configuration** of GitHub Actions environment variables for daily **automatic** delivery.
 
 ## ✨ Features
 - Totally free! All the calculation can be done in the Github Action runner locally within its quota (for public repo).
@@ -78,6 +79,7 @@ There are also some public variables (Repository Variables) you can set, which a
 | ZOTERO_IGNORE | | str | Gitignore-style patterns marking the Zotero collections that should be ignored. One rule one line. Learn more about [gitignore](https://git-scm.com/docs/gitignore). | AI Agent/<br>**/survey<br>!LLM/survey |
 | REPOSITORY | | str | The repository that provides the workflow. If set, the value can only be `TideDra/zotero-arxiv-daily`, in which case, the workflow always pulls the latest code from this upstream repo, so that you don't need to sync your forked repo upon each update, unless the workflow file is changed. | `TideDra/zotero-arxiv-daily` |
 | REF | | str | The specified ref of the workflow to run. Only valid when REPOSITORY is set to `TideDra/zotero-arxiv-daily`. Currently supported values include `main` for stable version, `dev` for development version which has new features and potential bugs. | `main` |
+| LANGUAGE | | str | The language of the TLDR; its value is directly embedded in the prompt passed to the LLM. | Chinese |
 
 That's all! Now you can test the workflow by manually triggering it:
 ![test](./assets/test.png)
@@ -111,7 +113,7 @@ This project is in active development. You can subscribe this repo via `Watch` s
 
 
 ## 📖 How it works
-*Zotero-arXiv-Daily* firstly retrieves all the papers in your Zotero libarary and all the papers released in the previous day, via corresponding API. Then it calculates the embedding of each paper's abstract via an embedding model. The score of a paper is its weighted average similarity over all your Zotero papers (newer paper added to the libarary has higher weight).
+*Zotero-arXiv-Daily* first retrieves all the papers in your Zotero library and all the papers released on the previous day, via the corresponding APIs. Then it calculates the embedding of each paper's abstract via an embedding model. The score of a paper is its weighted average similarity over all your Zotero papers (papers added to the library more recently have higher weight).
 
 The TLDR of each paper is generated by a lightweight LLM (Qwen2.5-3b-instruct-q4_k_m), given its title, abstract, introduction, and conclusion (if any). The introduction and conclusion are extracted from the source latex file of the paper.
 

diff --git a/llm.py b/llm.py
@@ -5,7 +5,7 @@
 GLOBAL_LLM = None
 
 class LLM:
-    def __init__(self, api_key: str = None, base_url: str = None, model: str = None):
+    def __init__(self, api_key: str = None, base_url: str = None, model: str = None,lang: str = "English"):
         if api_key:
             self.llm = OpenAI(api_key=api_key, base_url=base_url)
         else:
@@ -17,6 +17,7 @@ def __init__(self, api_key: str = None, base_url: str = None, model: str = None)
                 verbose=False,
             )
         self.model = model
+        self.lang = lang
 
     def generate(self, messages: list[dict]) -> str:
         if isinstance(self.llm, OpenAI):
@@ -26,9 +27,9 @@ def generate(self, messages: list[dict]) -> str:
             response = self.llm.create_chat_completion(messages=messages,temperature=0)
             return response["choices"][0]["message"]["content"]
 
-def set_global_llm(api_key: str = None, base_url: str = None, model: str = None):
+def set_global_llm(api_key: str = None, base_url: str = None, model: str = None, lang: str = "English"):
     global GLOBAL_LLM
-    GLOBAL_LLM = LLM(api_key=api_key, base_url=base_url, model=model)
+    GLOBAL_LLM = LLM(api_key=api_key, base_url=base_url, model=model, lang=lang)
 
 def get_llm() -> LLM:
     if GLOBAL_LLM is None:

diff --git a/main.py b/main.py
@@ -135,6 +135,12 @@ def get_env(key:str,default=None):
         help="LLM Model Name",
         default="gpt-4o",
     )
+    add_argument(
+        "--language",
+        type=str,
+        help="Language of TLDR",
+        default="English",
+    )
     parser.add_argument('--debug', action='store_true', help='Debug mode')
     args = parser.parse_args()
     assert (
@@ -168,10 +174,10 @@ def get_env(key:str,default=None):
             papers = papers[:args.max_paper_num]
         if args.use_llm_api:
             logger.info("Using OpenAI API as global LLM.")
-            set_global_llm(api_key=args.openai_api_key, base_url=args.openai_api_base, model=args.model_name)
+            set_global_llm(api_key=args.openai_api_key, base_url=args.openai_api_base, model=args.model_name, lang=args.language)
         else:
             logger.info("Using Local LLM as global LLM.")
-            set_global_llm()
+            set_global_llm(lang=args.language)
 
     html = render_email(papers)
     logger.info("Sending email...")

diff --git a/paper.py b/paper.py
@@ -154,13 +154,15 @@ def tldr(self) -> str:
             match = re.search(r'\\section\{Conclusion\}.*?(\\section|\\end\{document\}|\\bibliography|\\appendix|$)', content, flags=re.DOTALL)
             if match:
                 conclusion = match.group(0)
-        prompt = """Given the title, abstract, introduction and the conclusion (if any) of a paper in latex format, generate a one-sentence TLDR summary:
+        llm = get_llm()
+        prompt = """Given the title, abstract, introduction and the conclusion (if any) of a paper in latex format, generate a one-sentence TLDR summary in __LANG__:
         
         \\title{__TITLE__}
         \\begin{abstract}__ABSTRACT__\\end{abstract}
         __INTRODUCTION__
         __CONCLUSION__
         """
+        prompt = prompt.replace('__LANG__', llm.lang)
         prompt = prompt.replace('__TITLE__', self.title)
         prompt = prompt.replace('__ABSTRACT__', self.summary)
         prompt = prompt.replace('__INTRODUCTION__', introduction)
@@ -171,7 +173,7 @@ def tldr(self) -> str:
         prompt_tokens = enc.encode(prompt)
         prompt_tokens = prompt_tokens[:4000]  # truncate to 4000 tokens
         prompt = enc.decode(prompt_tokens)
-        llm = get_llm()
+
         tldr = llm.generate(
             messages=[
                 {
@@ -190,7 +192,9 @@ def affiliations(self) -> Optional[list[str]]:
             if content is None:
                 content = "\n".join(self.tex.values())
             #search for affiliations
-            match = re.search(r'\\author.*?\\maketitle', content, flags=re.DOTALL)
+            possible_regions = [r'\\author.*?\\maketitle',r'\\begin{document}.*?\\begin{abstract}']
+            matches = [re.search(p, content, flags=re.DOTALL) for p in possible_regions]
+            match = next((m for m in matches if m), None)
             if match:
                 information_region = match.group(0)
             else:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "zotero-arxiv-daily"
-version = "0.3.3"
+version = "0.3.4"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.11"

diff --git a/uv.lock b/uv.lock