Fix crash when the downloaded paper source is not a valid tar archive (catch tarfile.ReadError and return None)

hbzcnc · Dec 24, 2024 · b3ce1d4
1 parent fd8c3b0
commit b3ce1d4
Showing 1 changed file with 65 additions and 59 deletions.
diff --git a/paper.py b/paper.py
@@ -9,6 +9,7 @@
 from requests.adapters import HTTPAdapter, Retry
 from loguru import logger
 import tiktoken
+from contextlib import ExitStack
 
 
 
@@ -63,67 +64,72 @@ def code_url(self) -> Optional[str]:
 
     @cached_property
     def tex(self) -> dict[str,str]:
-        with TemporaryDirectory() as tmpdirname:
+        with ExitStack() as stack:
+            tmpdirname = stack.enter_context(TemporaryDirectory())
             file = self._paper.download_source(dirpath=tmpdirname)
-            with tarfile.open(file) as tar:
-                tex_files = [f for f in tar.getnames() if f.endswith('.tex')]
-                if len(tex_files) == 0:
-                    logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file.")
-                    return None
-
-                bbl_file = [f for f in tar.getnames() if f.endswith('.bbl')]
-                match len(bbl_file) :
-                    case 0:
-                        if len(tex_files) > 1:
-                            logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple tex files while no bbl file.")
-                            main_tex = None
-                        else:
-                            main_tex = tex_files[0]
-                    case 1:
-                        main_name = bbl_file[0].replace('.bbl','')
-                        main_tex = f"{main_name}.tex"
-                        if main_tex not in tex_files:
-                            logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: The bbl file does not match any tex file.")
-                            main_tex = None
-                    case _:
-                        logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple bbl files.")
+            try:
+                tar = stack.enter_context(tarfile.open(file))
+            except tarfile.ReadError:
+                logger.debug(f"Failed to find main tex file of {self.arxiv_id}: Not a tar file.")
+                return None
+
+            tex_files = [f for f in tar.getnames() if f.endswith('.tex')]
+            if len(tex_files) == 0:
+                logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file.")
+                return None
+
+            bbl_file = [f for f in tar.getnames() if f.endswith('.bbl')]
+            match len(bbl_file) :
+                case 0:
+                    if len(tex_files) > 1:
+                        logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple tex files while no bbl file.")
                         main_tex = None
-
-                if main_tex is None:
-                    logger.debug(f"Trying to choose tex file containing the document block as main tex file of {self.arxiv_id}")
-                #read all tex files
-                file_contents = {}
-                for t in tex_files:
-                    f = tar.extractfile(t)
-                    content = f.read().decode('utf-8')
-                    #remove comments
-                    content = re.sub(r'%.*\n', '\n', content)
-                    content = re.sub(r'\\begin{comment}.*?\\end{comment}', '', content, flags=re.DOTALL)
-                    content = re.sub(r'\\iffalse.*?\\fi', '', content, flags=re.DOTALL)
-                    #remove redundant \n
-                    content = re.sub(r'\n+', '\n', content)
-                    content = re.sub(r'\\\\', '', content)
-                    #remove consecutive spaces
-                    content = re.sub(r'[ \t\r\f]{3,}', ' ', content)
-                    if main_tex is None and re.search(r'\\begin\{document\}', content):
-                        main_tex = t
-                        logger.debug(f"Choose {t} as main tex file of {self.arxiv_id}")
-                    file_contents[t] = content
-
-                if main_tex is not None:
-                    main_source:str = file_contents[main_tex]
-                    #find and replace all included sub-files
-                    include_files = re.findall(r'\\input\{(.+?)\}', main_source) + re.findall(r'\\include\{(.+?)\}', main_source)
-                    for f in include_files:
-                        if not f.endswith('.tex'):
-                            file_name = f + '.tex'
-                        else:
-                            file_name = f
-                        main_source = main_source.replace(f'\\input{{{f}}}', file_contents.get(file_name, ''))
-                    file_contents["all"] = main_source
-                else:
-                    logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file containing the document block.")
-                    file_contents["all"] = None
+                    else:
+                        main_tex = tex_files[0]
+                case 1:
+                    main_name = bbl_file[0].replace('.bbl','')
+                    main_tex = f"{main_name}.tex"
+                    if main_tex not in tex_files:
+                        logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: The bbl file does not match any tex file.")
+                        main_tex = None
+                case _:
+                    logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple bbl files.")
+                    main_tex = None
+            if main_tex is None:
+                logger.debug(f"Trying to choose tex file containing the document block as main tex file of {self.arxiv_id}")
+            #read all tex files
+            file_contents = {}
+            for t in tex_files:
+                f = tar.extractfile(t)
+                content = f.read().decode('utf-8')
+                #remove comments
+                content = re.sub(r'%.*\n', '\n', content)
+                content = re.sub(r'\\begin{comment}.*?\\end{comment}', '', content, flags=re.DOTALL)
+                content = re.sub(r'\\iffalse.*?\\fi', '', content, flags=re.DOTALL)
+                #remove redundant \n
+                content = re.sub(r'\n+', '\n', content)
+                content = re.sub(r'\\\\', '', content)
+                #remove consecutive spaces
+                content = re.sub(r'[ \t\r\f]{3,}', ' ', content)
+                if main_tex is None and re.search(r'\\begin\{document\}', content):
+                    main_tex = t
+                    logger.debug(f"Choose {t} as main tex file of {self.arxiv_id}")
+                file_contents[t] = content
+
+            if main_tex is not None:
+                main_source:str = file_contents[main_tex]
+                #find and replace all included sub-files
+                include_files = re.findall(r'\\input\{(.+?)\}', main_source) + re.findall(r'\\include\{(.+?)\}', main_source)
+                for f in include_files:
+                    if not f.endswith('.tex'):
+                        file_name = f + '.tex'
+                    else:
+                        file_name = f
+                    main_source = main_source.replace(f'\\input{{{f}}}', file_contents.get(file_name, ''))
+                file_contents["all"] = main_source
+            else:
+                logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file containing the document block.")
+                file_contents["all"] = None
         return file_contents
 
     @cached_property