Skip to content

Commit

Permalink
fix invalid tar
Browse files Browse the repository at this point in the history
  • Loading branch information
TideDra committed Dec 24, 2024
1 parent fd8c3b0 commit b3ce1d4
Showing 1 changed file with 65 additions and 59 deletions.
124 changes: 65 additions & 59 deletions paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from requests.adapters import HTTPAdapter, Retry
from loguru import logger
import tiktoken
from contextlib import ExitStack



Expand Down Expand Up @@ -63,67 +64,72 @@ def code_url(self) -> Optional[str]:

@cached_property
def tex(self) -> dict[str,str]:
with TemporaryDirectory() as tmpdirname:
with ExitStack() as stack:
tmpdirname = stack.enter_context(TemporaryDirectory())
file = self._paper.download_source(dirpath=tmpdirname)
with tarfile.open(file) as tar:
tex_files = [f for f in tar.getnames() if f.endswith('.tex')]
if len(tex_files) == 0:
logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file.")
return None

bbl_file = [f for f in tar.getnames() if f.endswith('.bbl')]
match len(bbl_file) :
case 0:
if len(tex_files) > 1:
logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple tex files while no bbl file.")
main_tex = None
else:
main_tex = tex_files[0]
case 1:
main_name = bbl_file[0].replace('.bbl','')
main_tex = f"{main_name}.tex"
if main_tex not in tex_files:
logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: The bbl file does not match any tex file.")
main_tex = None
case _:
logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple bbl files.")
try:
tar = stack.enter_context(tarfile.open(file))
except tarfile.ReadError:
logger.debug(f"Failed to find main tex file of {self.arxiv_id}: Not a tar file.")
return None

tex_files = [f for f in tar.getnames() if f.endswith('.tex')]
if len(tex_files) == 0:
logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file.")
return None

bbl_file = [f for f in tar.getnames() if f.endswith('.bbl')]
match len(bbl_file) :
case 0:
if len(tex_files) > 1:
logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple tex files while no bbl file.")
main_tex = None

if main_tex is None:
logger.debug(f"Trying to choose tex file containing the document block as main tex file of {self.arxiv_id}")
#read all tex files
file_contents = {}
for t in tex_files:
f = tar.extractfile(t)
content = f.read().decode('utf-8')
#remove comments
content = re.sub(r'%.*\n', '\n', content)
content = re.sub(r'\\begin{comment}.*?\\end{comment}', '', content, flags=re.DOTALL)
content = re.sub(r'\\iffalse.*?\\fi', '', content, flags=re.DOTALL)
#remove redundant \n
content = re.sub(r'\n+', '\n', content)
content = re.sub(r'\\\\', '', content)
#remove consecutive spaces
content = re.sub(r'[ \t\r\f]{3,}', ' ', content)
if main_tex is None and re.search(r'\\begin\{document\}', content):
main_tex = t
logger.debug(f"Choose {t} as main tex file of {self.arxiv_id}")
file_contents[t] = content

if main_tex is not None:
main_source:str = file_contents[main_tex]
#find and replace all included sub-files
include_files = re.findall(r'\\input\{(.+?)\}', main_source) + re.findall(r'\\include\{(.+?)\}', main_source)
for f in include_files:
if not f.endswith('.tex'):
file_name = f + '.tex'
else:
file_name = f
main_source = main_source.replace(f'\\input{{{f}}}', file_contents.get(file_name, ''))
file_contents["all"] = main_source
else:
logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file containing the document block.")
file_contents["all"] = None
else:
main_tex = tex_files[0]
case 1:
main_name = bbl_file[0].replace('.bbl','')
main_tex = f"{main_name}.tex"
if main_tex not in tex_files:
logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: The bbl file does not match any tex file.")
main_tex = None
case _:
logger.debug(f"Cannot find main tex file of {self.arxiv_id} from bbl: There are multiple bbl files.")
main_tex = None
if main_tex is None:
logger.debug(f"Trying to choose tex file containing the document block as main tex file of {self.arxiv_id}")
#read all tex files
file_contents = {}
for t in tex_files:
f = tar.extractfile(t)
content = f.read().decode('utf-8')
#remove comments
content = re.sub(r'%.*\n', '\n', content)
content = re.sub(r'\\begin{comment}.*?\\end{comment}', '', content, flags=re.DOTALL)
content = re.sub(r'\\iffalse.*?\\fi', '', content, flags=re.DOTALL)
#remove redundant \n
content = re.sub(r'\n+', '\n', content)
content = re.sub(r'\\\\', '', content)
#remove consecutive spaces
content = re.sub(r'[ \t\r\f]{3,}', ' ', content)
if main_tex is None and re.search(r'\\begin\{document\}', content):
main_tex = t
logger.debug(f"Choose {t} as main tex file of {self.arxiv_id}")
file_contents[t] = content

if main_tex is not None:
main_source:str = file_contents[main_tex]
#find and replace all included sub-files
include_files = re.findall(r'\\input\{(.+?)\}', main_source) + re.findall(r'\\include\{(.+?)\}', main_source)
for f in include_files:
if not f.endswith('.tex'):
file_name = f + '.tex'
else:
file_name = f
main_source = main_source.replace(f'\\input{{{f}}}', file_contents.get(file_name, ''))
file_contents["all"] = main_source
else:
logger.debug(f"Failed to find main tex file of {self.arxiv_id}: No tex file containing the document block.")
file_contents["all"] = None
return file_contents

@cached_property
Expand Down

0 comments on commit b3ce1d4

Please sign in to comment.