feat: 增加文件大小对比功能，并在大小对比检查后支持断点续传；默认检查方式改为对比大小。

考虑移除时长检测功能。
tkgs0 · Apr 9, 2024 · 1c2c276 · 1c2c276
1 parent fd274ee
commit 1c2c276
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 80 deletions.
diff --git a/README.md b/README.md
@@ -24,15 +24,19 @@
 - [x] 文件检查 (通过时长)
 - [x] 错误文件重新下载
 - [x] 支持更多格式 (通过 `ffmpeg` 和 `ffprobe`)
-- [ ] ffmpeg的分析很慢, 寻找更好的方式
+- [x] ffmpeg的分析很慢, 寻找更好的方式（检测大小或许能替代？）
 - [ ] 指定下载路径
 - [ ] 下载文件中途停止记录
-- [ ] 断点续传
+- [x] 断点续传
 - [ ] 下载自动分类配置
 
 
 ## 使用
 
+### 音频大小对比模式
+支持断点续传
+
+### 音频时长分析模式
 不使用 `ffmpeg` 和 `ffprobe` 时仅支持 `mp3` `wav` `flac` 格式的音频分析  
 
 [ffmpeg Documentation](https://www.ffmpeg.org/)  
@@ -46,6 +50,7 @@
 未安装**ffmpeg**时可能会报缺少**libsndfile**等运行库,  
 仍需要另外安装相关依赖.
 
+使用 **checktime** 对部分 mp3 文件进行时长检测时，获取到的时长差异过大，且处于错误状态的文件仍会被检测为正常（如 RJ172342），故使用时长检测时默认仍保留重新下载的模式。
 
 <details>
   <summary>Install ffmpeg or libsndfile</summary>
@@ -100,9 +105,12 @@ pip install -U asmr-spider
 #直接下载, 默认检查重复
 asmr RJ373001 RJ385913
 #或者
-asmr RJ373001 RJ385913 -a check
+asmr RJ373001 RJ385913 -a checksize
 # `asmr` 后面接RJ号, 可输入多个, 使用空格隔开
 
+#通过时长检测重复内容,目前不支持断点续传
+asmr RJ373001 RJ385913 -a checktime
+
 #禁用检查, 跳过已下载的文件
 asmr RJ373001 RJ385913 -a nocheck
 
@@ -120,7 +128,7 @@ from asmr_spider import dload
 
 async def demo():
     args = ['RJ373001', 'RJ385913']
-    action = 'check'  # 'check', 'redownload', 'nocheck'
+    action = 'checksize'  # 'checksize', 'checktime','redownload', 'nocheck'
     await dload(args, action)
 ```
 

diff --git a/asmr_spider/__init__.py b/asmr_spider/__init__.py
@@ -1,7 +1,8 @@
 from .spider import ASMRSpider
 from .config import logger, progress
 from typing import List
-import argparse, shutil
+import argparse
+import shutil
 
 
 parser = argparse.ArgumentParser(description='Spide form asmr.one')
@@ -14,27 +15,31 @@
 
 parser.add_argument(
     '-a', '--action',
-    choices=['check', 'redownload', 'nocheck'],
-    default='check',
-    help='是否检查已下载内容, check检查, redownload重新下载, nocheck跳过已下载内容, 默认check'
+    choices=['checksize', 'checktime', 'redownload', 'nocheck'],
+    default='checksize',
+    help='是否检查已下载内容, checksize对比服务器文件大小,checktime, redownload重新下载, nocheck跳过已下载内容, 默认checksize'
 )
 
 
 async def dload(args: List[str], action):
     try:
-        async with ASMRSpider(check_ffmpeg_installed()) as spider:
+        async with ASMRSpider(check_ffmpeg_installed(action == 'checktime')) as spider:
             for arg in args:
                 await spider.download(str(arg), action)
     except Exception as e:
         logger.exception(e)
         raise e
 
-def check_ffmpeg_installed():
+
+def check_ffmpeg_installed(is_need_check):
+    if not is_need_check:
+        return False
     if shutil.which('ffmpeg') is not None and shutil.which('ffprobe') is not None:
-        logger.warning(f:=f"FFMPEG and FFPROBE 启用, 增加支持的格式。")
+        logger.warning(f := f"FFMPEG and FFPROBE 启用, 增加支持的格式。")
         progress.console.log(f)
         return True
     else:
-        logger.warning(f:=f"FFMPEG 或者 FFPROBE 没有检测到, 将仅支持 MP3、wav、flac 格式的音频.")
+        logger.warning(
+            f := f"FFMPEG 或者 FFPROBE 没有检测到, 将仅支持 MP3、wav、flac 格式的音频.")
         progress.console.log(f)
         return False
diff --git a/asmr_spider/spider.py b/asmr_spider/spider.py
@@ -1,4 +1,5 @@
-import asyncio, soundfile, os
+import asyncio
+import os
 from typing import Any, Dict, List
 try:
     import ujson as json
@@ -8,13 +9,14 @@
 from httpx import AsyncClient
 
 from .config import config, progress, logger
-from pydub import AudioSegment
 
 
 timeout: int = 120
 semaphore: int = 16
 default_audio_exts: tuple = ('.wav', '.flac', '.mp3')
-ffmpeg_audio_exts: tuple = ('.wma', '.ogg', '.m4a', '.ape', '.opus', '.aac', '.mka')
+ffmpeg_audio_exts: tuple = ('.wma', '.ogg', '.m4a',
+                            '.ape', '.opus', '.aac', '.mka')
+
 
 class ASMRSpider:
 
@@ -27,87 +29,121 @@ def __init__(self, support_ffmpeg) -> None:
         }
         self.support_ffmpeg = support_ffmpeg
 
-
     async def login(self) -> None:
         resp = await self.client.post(
             "https://api.asmr.one/api/auth/me",
             json={"name": self.name, "password": self.password},
             headers=self.headers,
-            timeout=timeout
-        )
+            timeout=timeout)
         self.headers |= {
             "Authorization": f"Bearer {(resp.json())['token']}",
         }
 
-
     async def get_voice_info(self, voice_id: str) -> Dict[str, Any]:
         resp = await self.client.get(
             f"https://api.asmr.one/api/work/{voice_id}",
             headers=self.headers,
-            timeout=timeout
-        )
+            timeout=timeout)
         return resp.json()
 
-
     async def get_voice_tracks(self, voice_id):
         resp = await self.client.get(
             f"https://api.asmr.one/api/tracks/{voice_id}",
             headers=self.headers,
-            timeout=timeout
-        )
+            timeout=timeout)
         return resp.json()
 
-
-    def is_bad_file(self, file, file_time):
+    async def check_file_time(self, file_path, file_time):
         try:
             is_bad = False
             duration = 0.0
-            #ffmpeg 查看时长很慢, 所以能使用其他库就不用ffmpeg
-            if os.path.splitext(file)[-1].lower() in default_audio_exts:
-                data = soundfile.SoundFile(file)
+            # ffmpeg 查看时长很慢, 所以能使用其他库就不用ffmpeg
+            if os.path.splitext(file_path)[-1].lower() in default_audio_exts:
+                import soundfile  # 尝试需要时再进行导入
+                data = soundfile.SoundFile(file_path)
                 duration = data.frames/data.samplerate
 
-            elif self.support_ffmpeg and os.path.splitext(file)[-1].lower() in ffmpeg_audio_exts:
-                sound = AudioSegment.from_file(file)
+            elif self.support_ffmpeg and os.path.splitext(file_path)[-1].lower() in ffmpeg_audio_exts:
+                from pydub import AudioSegment
+                sound = AudioSegment.from_file(file_path)
                 duration = sound.duration_seconds
 
             else:
-                logger.info(f := (f"文件跳过检测: {file}"))
+                logger.info(f := (f"文件跳过检测: {file_path}"))
                 progress.console.log(f)
                 return False
 
-            is_bad = (file_time - duration)>0.1
-            logger.info(f :=f"检测文件: {file}, 文件是否完全下载: {not is_bad}\n"
-                                    f"获取时长: {file_time}, 本地时长: {duration}")
+            is_bad = (file_time - duration) > 0.1
+            logger.info(f := f"检测文件: {file_path}, 文件是否完全下载: {not is_bad}\n"
+                        f"获取时长: {file_time}, 本地时长: {duration}")
             progress.console.log(f)
             return is_bad
 
         except Exception as e:
-                print(str(e))
-                logger.exception(e)
-                raise e
-
+            print(str(e))
+            logger.exception(e)
+            raise e
+
+    async def check_file_size(self, url: str, file_size, file_name):
+        is_bad = False
+        temp_headers = self.headers.copy()
+
+        d = {"Accept-Encoding": "identity"}
+        temp_headers.update(d)
+        # 单独获取一次长度
+        async with self.client.stream("HEAD",
+                                      url=url,
+                                      headers=self.headers,
+                                      timeout=timeout) as resp_get_length:
+            if resp_get_length.status_code != 200:
+                return
+            remote_size = -1
+            if resp_get_length.headers.get('Content-Length'):
+                remote_size = int(
+                    resp_get_length.headers.get('Content-Length'))
+            elif resp_get_length.headers.get('x-content-length'):
+                remote_size = int(
+                    resp_get_length.headers.get('x-content-length'))
+
+            is_bad = (remote_size - file_size) > 0 or remote_size == -1
+            logger.info(f := f"检测文件: {file_name}, 文件是否完全下载: {not is_bad}\n"
+                        f"获取大小长: {remote_size}, 本地大小: {file_size}")
+            progress.console.log(f)
+            return is_bad
 
-    async def download_file(self, sem, url: str, save_path: Path, file_name: str, file_time: float) -> None:
-        file_name = file_name.translate(str.maketrans(r'/\:*?"<>|', "_________"))
+    async def download_file(self, sem, url: str, save_path: Path,
+                            file_name: str, file_time) -> None:
+        file_name = file_name.translate(
+            str.maketrans(r'/\:*?"<>|', "_________"))
         file_path = save_path / file_name
-        #筛选是否重新下载
+        temp_headers = self.headers.copy()
+
+        # 本地已经下载的文件大小
+        file_size = (0 if not os.path.exists(file_path) else
+                     os.path.getsize(file_path))
+
+        # 筛选是否重新下载
         is_checked_not_pass = True
+        file_option = 'wb'
         if file_path.exists():
-            if self.checkAction == 'check':
-                is_checked_not_pass = self.is_bad_file(file_path, file_time)
+            if self.checkAction == 'checksize':
+                is_checked_not_pass = await self.check_file_size(url, file_size, file_name)
+                d = {"Range": "bytes=%d-" % file_size}
+                temp_headers.update(d)
+                file_option = 'ab'
+            elif self.checkAction == 'checktime':
+                is_checked_not_pass = await self.check_file_time(file_path, file_time)
             elif self.checkAction == 'nocheck':
-                    is_checked_not_pass = False
+                is_checked_not_pass = False
             elif self.checkAction == 'redownload':
-                    is_checked_not_pass = True
-
-        if not file_path.exists() or is_checked_not_pass:
+                is_checked_not_pass = True
+        if file_size == 0 or is_checked_not_pass:
             async with sem:
-                async with self.client.stream(
-                    "GET", url=url,
-                    headers=self.headers, timeout=timeout
-                ) as resp:
-                    if resp.status_code != 200:
+                async with self.client.stream("GET",
+                                              url=url,
+                                              headers=temp_headers,
+                                              timeout=timeout) as resp:
+                    if resp.status_code != 200 and resp.status_code != 206:
                         logger.error(f := f"{file_path}: {resp.status_code}")
                         progress.console.log(f, style='bold yellow on black')
                         return
@@ -117,41 +153,34 @@ async def download_file(self, sem, url: str, save_path: Path, file_name: str, fi
                         "download",
                         start=True,
                         total=int(total) if total else None,
-                        filename = (
-                            file_name[:4] + "..." + file_name[-4:]
-                            if len(file_name) > 12
-                            else file_name
-                        )
-                    )
-
-                    with open(file_path, 'wb') as fd:  # 写入文件
+                        filename=(file_name[:4] + "..." + file_name[-4:]
+                                  if len(file_name) > 12 else file_name))
+
+                    with open(file_path, file_option) as fd:  # 写入文件
                         async for chunk in resp.aiter_bytes(1024):
-                            fd.write(chunk)
-                            progress.update(
-                                task_id,
-                                advance=len(chunk),
-                            )
+                            if chunk:
+                                fd.write(chunk)
+                                progress.update(
+                                    task_id,
+                                    advance=len(chunk),
+                                )
+                            else:
+                                break
 
                     await asyncio.to_thread(progress.remove_task, task_id)
                     logger.success(f := f"{file_path}: Success.")
                     progress.console.log(f)
 
-
-    async def ensure_dir(self, tracks: List[Dict[str, Any]], root_path: Path) -> None:
+    async def ensure_dir(self, tracks: List[Dict[str, Any]],
+                         root_path: Path) -> None:
         folders: list = [i for i in tracks if i["type"] == "folder"]
         files: list = [i for i in tracks if i["type"] != "folder"]
 
         sem = asyncio.Semaphore(semaphore)
         down: list = []
         for file in files:
-            if "duration" in file:
-                down.append(
-                    self.download_file(sem, file["mediaDownloadUrl"], root_path, file["title"], file["duration"])
-                )
-            else:
-                  down.append(
-                    self.download_file(sem, file["mediaDownloadUrl"], root_path, file["title"], 0)
-                )
+            down.append(
+                self.download_file(sem, file["mediaDownloadUrl"], root_path, file["title"], file_time=file.get("duration")))
         with progress:
             await asyncio.gather(*down)
 
@@ -160,7 +189,6 @@ async def ensure_dir(self, tracks: List[Dict[str, Any]], root_path: Path) -> Non
             new_path.mkdir(parents=True, exist_ok=True)
             await self.ensure_dir(folder["children"], new_path)
 
-
     async def download(self, voice_id: str, action) -> None:
         self.checkAction = action
         voice_id = voice_id.strip().split("RJ")[-1]
@@ -191,12 +219,10 @@ async def download(self, voice_id: str, action) -> None:
         tracks = await self.get_voice_tracks(voice_id)
         await self.ensure_dir(tracks, root)
 
-
     async def __aenter__(self) -> "ASMRSpider":
         self.client = AsyncClient(proxies=config.proxy or None)
         await self.login()
         return self
 
-
     async def __aexit__(self, *args) -> None:
         await self.client.aclose()
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "asmr-spider"
-version = "0.2.1"
+version = "0.2.2"
 description = "asmr.one 音声下载器"
 authors = ["月ヶ瀬"]
 license = "GPL-3.0"