Skip to content

Commit

Permalink
feat: 增加文件大小对比功能,并在大小对比检查后支持断点续传;默认检查方式改为对比文件大小。
Browse files Browse the repository at this point in the history
考虑移除时长检测功能。
  • Loading branch information
c2879351010 committed Apr 9, 2024
1 parent fd274ee commit 1c2c276
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 80 deletions.
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,19 @@
- [x] 文件检查 (通过时长)
- [x] 错误文件重新下载
- [x] 支持更多格式 (通过 `ffmpeg` 和 `ffprobe`)
- [ ] ffmpeg的分析很慢, 寻找更好的方式
- [x] ffmpeg的分析很慢, 寻找更好的方式(检测大小或许能替代?)
- [ ] 指定下载路径
- [ ] 下载文件中途停止记录
- [ ] 断点续传
- [x] 断点续传
- [ ] 下载自动分类配置


## 使用

### 音频大小对比模式
支持断点续传

### 音频时长分析模式
不使用 `ffmpeg` 和 `ffprobe` 时仅支持 `mp3`、`wav`、`flac` 格式的音频分析

[ffmpeg Documentation](https://www.ffmpeg.org/)
Expand All @@ -46,6 +50,7 @@
未安装**ffmpeg**时可能会报缺少**libsndfile**等运行库,
仍需要另外安装相关依赖.

使用 **checktime** 对部分 mp3 文件进行时长检测时,获取到的时长差异过大,并且文件处于错误状态时仍可能被检测为正常(如 RJ172342)。因此,使用时长检测模式时默认仍保留重新下载的方式。

<details>
<summary>Install ffmpeg or libsndfile</summary>
Expand Down Expand Up @@ -100,9 +105,12 @@ pip install -U asmr-spider
#直接下载, 默认检查重复
asmr RJ373001 RJ385913
#或者
asmr RJ373001 RJ385913 -a check
asmr RJ373001 RJ385913 -a checksize
# `asmr` 后面接RJ号, 可输入多个, 使用空格隔开

#通过时长检测重复内容,目前不支持断点续传
asmr RJ373001 RJ385913 -a checktime

#禁用检查, 跳过已下载的文件
asmr RJ373001 RJ385913 -a nocheck

Expand All @@ -120,7 +128,7 @@ from asmr_spider import dload

async def demo():
    """Download two works through the library API using the default size check."""
    args = ['RJ373001', 'RJ385913']
    # Accepted values: 'checksize', 'checktime', 'redownload', 'nocheck'.
    action = 'checksize'
    await dload(args, action)
```

Expand Down
21 changes: 13 additions & 8 deletions asmr_spider/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from .spider import ASMRSpider
from .config import logger, progress
from typing import List
import argparse, shutil
import argparse
import shutil


parser = argparse.ArgumentParser(description='Spide form asmr.one')
Expand All @@ -14,27 +15,31 @@

# -a/--action selects how files that already exist on disk are treated.
# The help text now describes every choice, including checktime.
parser.add_argument(
    '-a', '--action',
    choices=['checksize', 'checktime', 'redownload', 'nocheck'],
    default='checksize',
    help='是否检查已下载内容, checksize对比服务器文件大小, checktime对比音频时长, '
         'redownload重新下载, nocheck跳过已下载内容, 默认checksize'
)


async def dload(args: List[str], action):
    """Download every requested work with a single logged-in spider session.

    Args:
        args: Work identifiers; each one is converted to ``str`` and passed
            to ``ASMRSpider.download``.
        action: Check mode forwarded unchanged to ``ASMRSpider.download``
            (the CLI offers 'checksize', 'checktime', 'redownload', 'nocheck').

    Raises:
        Exception: Any error raised while opening the session or downloading
            is logged and then re-raised to the caller.
    """
    # ffmpeg/ffprobe are only probed for when duration checking was requested.
    need_ffmpeg = action == 'checktime'
    try:
        async with ASMRSpider(check_ffmpeg_installed(need_ffmpeg)) as spider:
            for arg in args:
                await spider.download(str(arg), action)
    except Exception as e:
        logger.exception(e)
        # Bare raise keeps the original traceback intact.
        raise

def check_ffmpeg_installed(is_need_check=True):
    """Report whether both ``ffmpeg`` and ``ffprobe`` are available on PATH.

    Args:
        is_need_check: When falsy, the probe is skipped entirely and ``False``
            is returned without logging anything. Defaults to ``True`` so that
            callers written against the earlier zero-argument form still work.

    Returns:
        ``True`` only when the probe ran and both executables were found;
        ``False`` otherwise.
    """
    if not is_need_check:
        return False

    if shutil.which('ffmpeg') is not None and shutil.which('ffprobe') is not None:
        logger.warning(msg := "FFMPEG and FFPROBE 启用, 增加支持的格式。")
        progress.console.log(msg)
        return True

    logger.warning(
        msg := "FFMPEG 或者 FFPROBE 没有检测到, 将仅支持 MP3、wav、flac 格式的音频.")
    progress.console.log(msg)
    return False
160 changes: 93 additions & 67 deletions asmr_spider/spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio, soundfile, os
import asyncio
import os
from typing import Any, Dict, List
try:
import ujson as json
Expand All @@ -8,13 +9,14 @@
from httpx import AsyncClient

from .config import config, progress, logger
from pydub import AudioSegment


# Timeout value passed as ``timeout=`` to every HTTP request in this module.
timeout: int = 120
# Initial value of the asyncio.Semaphore that limits concurrent file downloads.
semaphore: int = 16
# Extensions whose duration is read with soundfile (no ffmpeg required).
default_audio_exts: tuple = ('.wav', '.flac', '.mp3')
# Extensions whose duration is read with pydub, only when ffmpeg is available.
ffmpeg_audio_exts: tuple = ('.wma', '.ogg', '.m4a',
                            '.ape', '.opus', '.aac', '.mka')


class ASMRSpider:

Expand All @@ -27,87 +29,121 @@ def __init__(self, support_ffmpeg) -> None:
}
self.support_ffmpeg = support_ffmpeg


async def login(self) -> None:
    """Authenticate against the API and store the bearer token in ``self.headers``.

    Posts ``self.name`` / ``self.password`` to the auth endpoint and merges an
    ``Authorization: Bearer <token>`` entry into ``self.headers`` so that
    later requests made with those headers are authenticated.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        KeyError: If the response body carries no ``token`` field.
    """
    resp = await self.client.post(
        "https://api.asmr.one/api/auth/me",
        json={"name": self.name, "password": self.password},
        headers=self.headers,
        timeout=timeout,
    )
    # Surface a rejected login as an HTTP error instead of an opaque
    # KeyError on 'token' further down.
    resp.raise_for_status()
    self.headers |= {
        "Authorization": f"Bearer {resp.json()['token']}",
    }


async def get_voice_info(self, voice_id: str) -> Dict[str, Any]:
    """Fetch the metadata record of one work.

    Args:
        voice_id: Identifier inserted into the ``/api/work/{voice_id}`` URL.

    Returns:
        The decoded JSON body of the response.
    """
    resp = await self.client.get(
        f"https://api.asmr.one/api/work/{voice_id}",
        headers=self.headers,
        timeout=timeout,
    )
    return resp.json()


async def get_voice_tracks(self, voice_id):
    """Fetch the track listing of one work.

    Args:
        voice_id: Identifier inserted into the ``/api/tracks/{voice_id}`` URL.

    Returns:
        The decoded JSON body of the response (the caller hands it to
        ``ensure_dir`` as the track tree).
    """
    resp = await self.client.get(
        f"https://api.asmr.one/api/tracks/{voice_id}",
        headers=self.headers,
        timeout=timeout,
    )
    return resp.json()


async def check_file_time(self, file_path, file_time):
    """Decide from its playback length whether a local audio file is incomplete.

    Args:
        file_path: Path of the already-downloaded file.
        file_time: Expected duration reported by the server, or ``None`` when
            the track carries no duration (presumably seconds, matching the
            locally measured value; confirm against the API).

    Returns:
        ``True`` when the local duration falls short of ``file_time`` by more
        than 0.1, meaning the file should be downloaded again. ``False`` when
        the file looks complete or cannot be checked (unknown expected
        duration, unsupported extension, or ffmpeg-only format without ffmpeg).

    Raises:
        Exception: Any error raised while reading the file is logged and
            re-raised.
    """
    try:
        suffix = os.path.splitext(file_path)[-1].lower()
        duration = None

        if file_time is None:
            # No reference duration to compare with; treat as "cannot check"
            # instead of failing on ``None - float`` below.
            pass
        elif suffix in default_audio_exts:
            # ffmpeg is slow at reading durations, so prefer soundfile when
            # it supports the format. Imported lazily: only this mode needs it.
            import soundfile
            # The context manager closes the file handle after reading.
            with soundfile.SoundFile(file_path) as data:
                duration = data.frames / data.samplerate
        elif self.support_ffmpeg and suffix in ffmpeg_audio_exts:
            from pydub import AudioSegment
            duration = AudioSegment.from_file(file_path).duration_seconds

        if duration is None:
            logger.info(msg := f"文件跳过检测: {file_path}")
            progress.console.log(msg)
            return False

        is_bad = (file_time - duration) > 0.1
        logger.info(msg := f"检测文件: {file_path}, 文件是否完全下载: {not is_bad}\n"
                    f"获取时长: {file_time}, 本地时长: {duration}")
        progress.console.log(msg)
        return is_bad

    except Exception as e:
        print(str(e))
        logger.exception(e)
        raise

async def check_file_size(self, url: str, file_size, file_name):
    """Compare the local file size with the size the server reports for ``url``.

    Args:
        url: Download URL of the file; queried with a HEAD request.
        file_size: Size in bytes of the local copy.
        file_name: Name used in log output only.

    Returns:
        ``True`` when the remote size is larger than ``file_size`` or cannot
        be determined, meaning more data must be fetched. ``False`` when the
        local copy is at least as large as the remote one, or when the HEAD
        request does not answer 200 (the file is then left untouched).
        Note that a local file larger than the remote one is not flagged.
    """
    # Ask for the unencoded representation so the reported length is the
    # real byte count of the file rather than of a compressed body.
    request_headers = {**self.headers, "Accept-Encoding": "identity"}

    async with self.client.stream("HEAD",
                                  url=url,
                                  headers=request_headers,
                                  timeout=timeout) as resp_get_length:
        if resp_get_length.status_code != 200:
            logger.warning(
                msg := f"{file_name}: {resp_get_length.status_code}")
            progress.console.log(msg, style='bold yellow on black')
            return False

        raw_length = (resp_get_length.headers.get('Content-Length')
                      or resp_get_length.headers.get('x-content-length'))

    # -1 marks "server did not report a size".
    remote_size = int(raw_length) if raw_length else -1

    is_bad = remote_size == -1 or remote_size > file_size
    logger.info(msg := f"检测文件: {file_name}, 文件是否完全下载: {not is_bad}\n"
                f"获取大小长: {remote_size}, 本地大小: {file_size}")
    progress.console.log(msg)
    return is_bad

async def download_file(self, sem, url: str, save_path: Path, file_name: str, file_time: float) -> None:
file_name = file_name.translate(str.maketrans(r'/\:*?"<>|', "_________"))
async def download_file(self, sem, url: str, save_path: Path,
file_name: str, file_time) -> None:
file_name = file_name.translate(
str.maketrans(r'/\:*?"<>|', "_________"))
file_path = save_path / file_name
#筛选是否重新下载
temp_headers = self.headers.copy()

# 本地已经下载的文件大小
file_size = (0 if not os.path.exists(file_path) else
os.path.getsize(file_path))

# 筛选是否重新下载
is_checked_not_pass = True
file_option = 'wb'
if file_path.exists():
if self.checkAction == 'check':
is_checked_not_pass = self.is_bad_file(file_path, file_time)
if self.checkAction == 'checksize':
is_checked_not_pass = await self.check_file_size(url, file_size, file_name)
d = {"Range": "bytes=%d-" % file_size}
temp_headers.update(d)
file_option = 'ab'
elif self.checkAction == 'checktime':
is_checked_not_pass = await self.check_file_time(file_path, file_time)
elif self.checkAction == 'nocheck':
is_checked_not_pass = False
is_checked_not_pass = False
elif self.checkAction == 'redownload':
is_checked_not_pass = True

if not file_path.exists() or is_checked_not_pass:
is_checked_not_pass = True
if file_size == 0 or is_checked_not_pass:
async with sem:
async with self.client.stream(
"GET", url=url,
headers=self.headers, timeout=timeout
) as resp:
if resp.status_code != 200:
async with self.client.stream("GET",
url=url,
headers=temp_headers,
timeout=timeout) as resp:
if resp.status_code != 200 and resp.status_code != 206:
logger.error(f := f"{file_path}: {resp.status_code}")
progress.console.log(f, style='bold yellow on black')
return
Expand All @@ -117,41 +153,34 @@ async def download_file(self, sem, url: str, save_path: Path, file_name: str, fi
"download",
start=True,
total=int(total) if total else None,
filename = (
file_name[:4] + "..." + file_name[-4:]
if len(file_name) > 12
else file_name
)
)

with open(file_path, 'wb') as fd: # 写入文件
filename=(file_name[:4] + "..." + file_name[-4:]
if len(file_name) > 12 else file_name))

with open(file_path, file_option) as fd: # 写入文件
async for chunk in resp.aiter_bytes(1024):
fd.write(chunk)
progress.update(
task_id,
advance=len(chunk),
)
if chunk:
fd.write(chunk)
progress.update(
task_id,
advance=len(chunk),
)
else:
break

await asyncio.to_thread(progress.remove_task, task_id)
logger.success(f := f"{file_path}: Success.")
progress.console.log(f)


async def ensure_dir(self, tracks: List[Dict[str, Any]], root_path: Path) -> None:
async def ensure_dir(self, tracks: List[Dict[str, Any]],
root_path: Path) -> None:
folders: list = [i for i in tracks if i["type"] == "folder"]
files: list = [i for i in tracks if i["type"] != "folder"]

sem = asyncio.Semaphore(semaphore)
down: list = []
for file in files:
if "duration" in file:
down.append(
self.download_file(sem, file["mediaDownloadUrl"], root_path, file["title"], file["duration"])
)
else:
down.append(
self.download_file(sem, file["mediaDownloadUrl"], root_path, file["title"], 0)
)
down.append(
self.download_file(sem, file["mediaDownloadUrl"], root_path, file["title"], file_time=file.get("duration")))
with progress:
await asyncio.gather(*down)

Expand All @@ -160,7 +189,6 @@ async def ensure_dir(self, tracks: List[Dict[str, Any]], root_path: Path) -> Non
new_path.mkdir(parents=True, exist_ok=True)
await self.ensure_dir(folder["children"], new_path)


async def download(self, voice_id: str, action) -> None:
self.checkAction = action
voice_id = voice_id.strip().split("RJ")[-1]
Expand Down Expand Up @@ -191,12 +219,10 @@ async def download(self, voice_id: str, action) -> None:
tracks = await self.get_voice_tracks(voice_id)
await self.ensure_dir(tracks, root)


async def __aenter__(self) -> "ASMRSpider":
    """Open the HTTP client, log in, and return this spider for ``async with``."""
    # A falsy proxy setting is normalised to None before being handed over.
    proxy_setting = config.proxy or None
    self.client = AsyncClient(proxies=proxy_setting)
    await self.login()
    return self


async def __aexit__(self, *args) -> None:
    """Close the HTTP client on leaving the ``async with`` block.

    The exception details passed in ``args`` are ignored, so an exception
    raised inside the block keeps propagating.
    """
    http_client = self.client
    await http_client.aclose()
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "asmr-spider"
version = "0.2.1"
version = "0.2.2"
description = "asmr.one 音声下载器"
authors = ["月ヶ瀬"]
license = "GPL-3.0"
Expand Down

0 comments on commit 1c2c276

Please sign in to comment.