Skip to content

Commit

Permalink
Merge pull request #37 from iscc/develop
Browse files Browse the repository at this point in the history
Version 0.9.6
  • Loading branch information
titusz authored Apr 24, 2020
2 parents f3c8d23 + e01a3ce commit ac5e450
Show file tree
Hide file tree
Showing 31 changed files with 394 additions and 125 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ Options:
Commands:
gen* Generate ISCC Code for FILE.
batch Create ISCC Codes for all files in PATH.
dump Dump Tika extraction results for FILE.
dump Dump Tika extraction results for PATH (file or url path).
info Show information about environment.
init Initialize and check environment.
sim Estimate Similarity of ISCC Codes A & B.
Expand Down Expand Up @@ -199,10 +199,17 @@ You may also want to join our developer chat on Telegram at <https://t.me/iscc_dev>

## Change Log

### [0.9.6] - 2020-04-24
- Support urls with dump command
- Updated to tika 1.24 and fpcalc 1.5.0
- Use filename for meta-id as last resort
- Switch to signed audio fingerprint (breaking change)
- Bugfixes and stability improvements

### [0.9.5] - 2020-03-02
- Support mobi7
- Support mobi print replica
- Support mobi wit web command
- Support mobi with web command

### [0.9.4] - 2020-03-02
- Add experimental support for mobi files
Expand Down
9 changes: 7 additions & 2 deletions iscc_cli/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
import iscc_cli.monkeys
import os
import click
from tika import tika

__version__ = "0.9.5"

__version__ = "0.9.6"
APP_NAME = "iscc-cli"
APP_DIR = click.get_app_dir(APP_NAME, roaming=False)
os.makedirs(iscc_cli.APP_DIR, exist_ok=True)
os.environ["TIKA_PATH"] = APP_DIR
os.environ["TIKA_LOG_PATH"] = APP_DIR
os.environ["TIKA_VERSION"] = "1.24"
os.environ["LOGURU_AUTOINIT"] = "False"


from tika import tika

tika.log.disabled = True
4 changes: 2 additions & 2 deletions iscc_cli/audio_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ def get_chroma_vector(file):

if hasattr(file, "read"):
file.seek(0)
cmd = [fpcalc.exe_path(), "-raw", "-json", "-"]
cmd = [fpcalc.exe_path(), "-raw", "-json", "-signed", "-"]
res = subprocess.run(cmd, stdout=subprocess.PIPE, input=file.read())
else:
cmd = [fpcalc.exe_path(), file, "-raw", "-json"]
cmd = [fpcalc.exe_path(), "-raw", "-json", "-signed", file]
res = subprocess.run(cmd, stdout=subprocess.PIPE)

vec = json.loads(res.stdout.decode("utf-8"))["fingerprint"]
Expand Down
3 changes: 2 additions & 1 deletion iscc_cli/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import click
from iscc_cli import __version__, init, gen, batch, sim, info, web, dump, test
from iscc_cli import __version__
from iscc_cli.commands import init, gen, batch, sim, info, web, dump, test
from click_default_group import DefaultGroup


Expand Down
Empty file added iscc_cli/commands/__init__.py
Empty file.
14 changes: 12 additions & 2 deletions iscc_cli/batch.py → iscc_cli/commands/batch.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import os
import shutil
from os.path import basename, abspath
import click
Expand Down Expand Up @@ -32,6 +33,11 @@ def batch(path, recursive, guess):
"""
results = []
for f in get_files(path, recursive=recursive):
filesize = os.path.getsize(f)
if not filesize:
click.echo("Cannot proccess empty file: {}".format(f))
continue

media_type = detector.from_file(f)
if media_type not in SUPPORTED_MIME_TYPES:
fname = basename(f)
Expand All @@ -54,12 +60,16 @@ def batch(path, recursive, guess):
else:
tika_result = parser.from_file(f)

title = get_title(tika_result, guess=guess)
title = get_title(tika_result, guess=guess, uri=f)

mid, norm_title, _ = iscc.meta_id(title)
gmt = mime_to_gmt(media_type, file_path=f)
if gmt == GMT.IMAGE:
cid = iscc.content_id_image(f)
try:
cid = iscc.content_id_image(f)
except Exception as e:
click.echo("Clould not proccess image: {} ({})".format(f, e))

elif gmt == GMT.TEXT:
text = tika_result["content"]
if not text:
Expand Down
13 changes: 7 additions & 6 deletions iscc_cli/dump.py → iscc_cli/commands/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,27 @@


@click.command(cls=DefaultHelp)
@click.argument("file", type=click.File("rb"))
@click.argument("path", type=click.STRING)
@click.option(
"-s", "--strip", type=click.INT, default=0, help="Strip content to first X chars."
)
@click.option("-m", "--meta", is_flag=True, default=False, help="Dump metadata only.")
@click.option("-c", "--content", is_flag=True, default=False, help="Dump content only.")
def dump(file, strip, meta, content):
"""Dump Tika extraction results for FILE."""
def dump(path, strip, meta, content):
"""Dump Tika extraction results for PATH (file or url path)."""

media_type = detector.from_file(path)

media_type = detector.from_file(file.name)
if media_type not in SUPPORTED_MIME_TYPES:
click.echo("Unsupported media type {}.".format(media_type))
click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")

if media_type == "application/x-mobipocket-ebook":
tempdir, epub_filepath = mobi.extract(file.name)
tempdir, epub_filepath = mobi.extract(path)
tika_result = parser.from_file(epub_filepath)
shutil.rmtree(tempdir)
else:
tika_result = parser.from_file(file.name)
tika_result = parser.from_file(path)

if all([meta, content]):
raise UsageError("Use either --meta or --content for selective output.")
Expand Down
7 changes: 6 additions & 1 deletion iscc_cli/gen.py → iscc_cli/commands/gen.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import os
import shutil
from os.path import abspath

Expand Down Expand Up @@ -28,6 +29,10 @@
@click.option("-v", "--verbose", is_flag=True, help="Enables verbose mode.")
def gen(file, guess, title, extra, verbose):
"""Generate ISCC Code for FILE."""
filesize = os.path.getsize(file.name)
if not filesize:
raise click.BadParameter("Cannot proccess empty file: {}".format(file.name))

media_type = detector.from_file(file.name)
if media_type not in SUPPORTED_MIME_TYPES:
click.echo("Unsupported media type {}.".format(media_type))
Expand All @@ -41,7 +46,7 @@ def gen(file, guess, title, extra, verbose):
tika_result = parser.from_file(file.name)

if not title:
title = get_title(tika_result, guess=guess)
title = get_title(tika_result, guess=guess, uri=file.name)

if not extra:
extra = ""
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion iscc_cli/web.py → iscc_cli/commands/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def web(url, guess, title, extra, verbose):
tika_result = parser.from_buffer(data)

if not title:
title = get_title(tika_result, guess=guess)
title = get_title(tika_result, guess=guess, uri=url)

mid, norm_title, _ = iscc.meta_id(title, extra)
gmt = mime_to_gmt(media_type)
Expand Down
11 changes: 6 additions & 5 deletions iscc_cli/fpcalc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from iscc_cli.utils import download_file


FPCALC_VERSION = "1.4.3"
FPCALC_VERSION = "1.5.0"
FPCALC_URL_BASE = "https://github.com/acoustid/chromaprint/releases/download/v{}/".format(
FPCALC_VERSION
)
Expand All @@ -24,10 +24,10 @@


def exe_path():
"""Returns patth to fpcalc executable."""
"""Returns path to fpcalc executable."""
if platform.system() == "Windows":
return os.path.join(iscc_cli.APP_DIR, "fpcalc.exe")
return os.path.join(iscc_cli.APP_DIR, "fpcalc")
return os.path.join(iscc_cli.APP_DIR, "fpcalc-{}.exe".format(FPCALC_VERSION))
return os.path.join(iscc_cli.APP_DIR, "fpcalc-{}".format(FPCALC_VERSION))


def is_installed():
Expand All @@ -47,6 +47,7 @@ def download():


def extract(archive):
"""Extract archive with fpcalc executable."""
if archive.endswith(".zip"):
with zipfile.ZipFile(archive, "r") as zip_file:
for member in zip_file.namelist():
Expand Down Expand Up @@ -83,6 +84,6 @@ def get_version_info():
"""Get fpcalc version"""
try:
r = subprocess.run([exe_path(), "-v"], stdout=subprocess.PIPE)
return r.stdout.decode("utf-8").strip().split()[-1]
return r.stdout.decode("utf-8").strip().split()[2]
except FileNotFoundError:
return 'WARNING: Not Installed - run "iscc init" to install!'
6 changes: 3 additions & 3 deletions iscc_cli/lib.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
"""Expose cli commands with standard python api."""
from typing import List, Dict
from iscc_cli.gen import gen
from iscc_cli.batch import batch
from iscc_cli.web import web
from iscc_cli.commands.gen import gen
from iscc_cli.commands.batch import batch
from iscc_cli.commands.web import web


def iscc_from_file(file, guess=False, title="", extra="") -> Dict:
Expand Down
13 changes: 10 additions & 3 deletions iscc_cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import re
import textwrap
from os import getcwd, listdir, walk
from os.path import isfile, splitext, isdir, join
from os.path import isfile, splitext, isdir, join, basename
from urllib.parse import urlparse
import click
import iscc
Expand Down Expand Up @@ -76,15 +76,16 @@ def mime_to_gmt(mime_type, file_path=None):
return gmt


def get_title(tika_result: dict, guess=False):
def get_title(tika_result: dict, guess=False, uri=None):
title = ""

meta = tika_result.get("metadata")
if meta:
title = meta.get("dc:title", "")
title = title[0].strip() if isinstance(title, list) else title.strip()
if not title:
title = meta.get("title", "").strip()
title = meta.get("title", "")
title = title[0].strip() if isinstance(title, list) else title.strip()

# See if string would survive normalization
norm_title = iscc.text_normalize(title, keep_ws=True)
Expand All @@ -95,6 +96,12 @@ def get_title(tika_result: dict, guess=False):
first_line = content.strip().splitlines()[0]
title = iscc.text_trim(iscc.text_normalize(first_line, keep_ws=True))

if not title and uri is not None:
result = urlparse(uri)
base = basename(result.path)
title = splitext(base)[0]
title = title.replace("-", " ")
title = title.replace("_", " ")
return title


Expand Down
Loading

0 comments on commit ac5e450

Please sign in to comment.