Merge pull request #37 from iscc/develop

Version 0.9.6
iscc · Apr 24, 2020 · ac5e450 · ac5e450
2 parents f3c8d23 + e01a3ce
commit ac5e450
Show file tree

Hide file tree

Showing 31 changed files with 394 additions and 125 deletions.
diff --git a/README.md b/README.md
@@ -89,7 +89,7 @@ Options:
 Commands:
   gen*   Generate ISCC Code for FILE.
   batch  Create ISCC Codes for all files in PATH.
-  dump   Dump Tika extraction results for FILE.
+  dump   Dump Tika extraction results for PATH (file or url path).
   info   Show information about environment.
   init   Initialize and check environment.
   sim    Estimate Similarity of ISCC Codes A & B.
@@ -199,10 +199,17 @@ You may also want join our developer chat on Telegram at <https://t.me/iscc_dev>
 
 ## Change Log
 
+### [0.9.6] - 2020-04-24
+- Support urls with dump command
+- Updated to tika 1.24 and fpcalc 1.5.0
+- Use filename for meta-id as last resort
+- Switch to signed audio fingerprint (breaking change)
+- Bugfixes and stability improvements
+
 ### [0.9.5] - 2020-03-02
 - Support mobi7
 - Support mobi print replica
-- Support mobi wit web command
+- Support mobi with web command
 
 ### [0.9.4] - 2020-03-02
 - Add experimental support for mobi files

diff --git a/iscc_cli/__init__.py b/iscc_cli/__init__.py
@@ -1,13 +1,18 @@
 import iscc_cli.monkeys
 import os
 import click
-from tika import tika
 
-__version__ = "0.9.5"
 
+__version__ = "0.9.6"
 APP_NAME = "iscc-cli"
 APP_DIR = click.get_app_dir(APP_NAME, roaming=False)
 os.makedirs(iscc_cli.APP_DIR, exist_ok=True)
 os.environ["TIKA_PATH"] = APP_DIR
+os.environ["TIKA_LOG_PATH"] = APP_DIR
+os.environ["TIKA_VERSION"] = "1.24"
 os.environ["LOGURU_AUTOINIT"] = "False"
+
+
+from tika import tika
+
 tika.log.disabled = True
diff --git a/iscc_cli/audio_id.py b/iscc_cli/audio_id.py
@@ -22,10 +22,10 @@ def get_chroma_vector(file):
 
     if hasattr(file, "read"):
         file.seek(0)
-        cmd = [fpcalc.exe_path(), "-raw", "-json", "-"]
+        cmd = [fpcalc.exe_path(), "-raw", "-json", "-signed", "-"]
         res = subprocess.run(cmd, stdout=subprocess.PIPE, input=file.read())
     else:
-        cmd = [fpcalc.exe_path(), file, "-raw", "-json"]
+        cmd = [fpcalc.exe_path(), "-raw", "-json", "-signed", file]
         res = subprocess.run(cmd, stdout=subprocess.PIPE)
 
     vec = json.loads(res.stdout.decode("utf-8"))["fingerprint"]

diff --git a/iscc_cli/cli.py b/iscc_cli/cli.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import click
-from iscc_cli import __version__, init, gen, batch, sim, info, web, dump, test
+from iscc_cli import __version__
+from iscc_cli.commands import init, gen, batch, sim, info, web, dump, test
 from click_default_group import DefaultGroup
 
 

diff --git a/iscc_cli/commands/__init__.py b/iscc_cli/commands/__init__.py
diff --git a/iscc_cli/batch.py → iscc_cli/commands/batch.py b/iscc_cli/batch.py → iscc_cli/commands/batch.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import os
 import shutil
 from os.path import basename, abspath
 import click
@@ -32,6 +33,11 @@ def batch(path, recursive, guess):
     """
     results = []
     for f in get_files(path, recursive=recursive):
+        filesize = os.path.getsize(f)
+        if not filesize:
+            click.echo("Cannot proccess empty file: {}".format(f))
+            continue
+
         media_type = detector.from_file(f)
         if media_type not in SUPPORTED_MIME_TYPES:
             fname = basename(f)
@@ -54,12 +60,16 @@ def batch(path, recursive, guess):
         else:
             tika_result = parser.from_file(f)
 
-        title = get_title(tika_result, guess=guess)
+        title = get_title(tika_result, guess=guess, uri=f)
 
         mid, norm_title, _ = iscc.meta_id(title)
         gmt = mime_to_gmt(media_type, file_path=f)
         if gmt == GMT.IMAGE:
-            cid = iscc.content_id_image(f)
+            try:
+                cid = iscc.content_id_image(f)
+            except Exception as e:
+                click.echo("Clould not proccess image: {} ({})".format(f, e))
+
         elif gmt == GMT.TEXT:
             text = tika_result["content"]
             if not text:

diff --git a/iscc_cli/dump.py → iscc_cli/commands/dump.py b/iscc_cli/dump.py → iscc_cli/commands/dump.py
@@ -11,26 +11,27 @@
 
 
 @click.command(cls=DefaultHelp)
-@click.argument("file", type=click.File("rb"))
+@click.argument("path", type=click.STRING)
 @click.option(
     "-s", "--strip", type=click.INT, default=0, help="Strip content to first X chars."
 )
 @click.option("-m", "--meta", is_flag=True, default=False, help="Dump metadata only.")
 @click.option("-c", "--content", is_flag=True, default=False, help="Dump content only.")
-def dump(file, strip, meta, content):
-    """Dump Tika extraction results for FILE."""
+def dump(path, strip, meta, content):
+    """Dump Tika extraction results for PATH (file or url path)."""
+
+    media_type = detector.from_file(path)
 
-    media_type = detector.from_file(file.name)
     if media_type not in SUPPORTED_MIME_TYPES:
         click.echo("Unsupported media type {}.".format(media_type))
         click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")
 
     if media_type == "application/x-mobipocket-ebook":
-        tempdir, epub_filepath = mobi.extract(file.name)
+        tempdir, epub_filepath = mobi.extract(path)
         tika_result = parser.from_file(epub_filepath)
         shutil.rmtree(tempdir)
     else:
-        tika_result = parser.from_file(file.name)
+        tika_result = parser.from_file(path)
 
     if all([meta, content]):
         raise UsageError("Use either --meta or --content for selective output.")

diff --git a/iscc_cli/gen.py → iscc_cli/commands/gen.py b/iscc_cli/gen.py → iscc_cli/commands/gen.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import os
 import shutil
 from os.path import abspath
 
@@ -28,6 +29,10 @@
 @click.option("-v", "--verbose", is_flag=True, help="Enables verbose mode.")
 def gen(file, guess, title, extra, verbose):
     """Generate ISCC Code for FILE."""
+    filesize = os.path.getsize(file.name)
+    if not filesize:
+        raise click.BadParameter("Cannot proccess empty file: {}".format(file.name))
+
     media_type = detector.from_file(file.name)
     if media_type not in SUPPORTED_MIME_TYPES:
         click.echo("Unsupported media type {}.".format(media_type))
@@ -41,7 +46,7 @@ def gen(file, guess, title, extra, verbose):
         tika_result = parser.from_file(file.name)
 
     if not title:
-        title = get_title(tika_result, guess=guess)
+        title = get_title(tika_result, guess=guess, uri=file.name)
 
     if not extra:
         extra = ""

diff --git a/iscc_cli/info.py → iscc_cli/commands/info.py b/iscc_cli/info.py → iscc_cli/commands/info.py
diff --git a/iscc_cli/init.py → iscc_cli/commands/init.py b/iscc_cli/init.py → iscc_cli/commands/init.py
diff --git a/iscc_cli/sim.py → iscc_cli/commands/sim.py b/iscc_cli/sim.py → iscc_cli/commands/sim.py
diff --git a/iscc_cli/test.py → iscc_cli/commands/test.py b/iscc_cli/test.py → iscc_cli/commands/test.py
diff --git a/iscc_cli/web.py → iscc_cli/commands/web.py b/iscc_cli/web.py → iscc_cli/commands/web.py
@@ -82,7 +82,7 @@ def web(url, guess, title, extra, verbose):
         tika_result = parser.from_buffer(data)
 
     if not title:
-        title = get_title(tika_result, guess=guess)
+        title = get_title(tika_result, guess=guess, uri=url)
 
     mid, norm_title, _ = iscc.meta_id(title, extra)
     gmt = mime_to_gmt(media_type)

diff --git a/iscc_cli/fpcalc.py b/iscc_cli/fpcalc.py
@@ -12,7 +12,7 @@
 from iscc_cli.utils import download_file
 
 
-FPCALC_VERSION = "1.4.3"
+FPCALC_VERSION = "1.5.0"
 FPCALC_URL_BASE = "https://github.com/acoustid/chromaprint/releases/download/v{}/".format(
     FPCALC_VERSION
 )
@@ -24,10 +24,10 @@
 
 
 def exe_path():
-    """Returns patth to fpcalc executable."""
+    """Returns path to fpcalc executable."""
     if platform.system() == "Windows":
-        return os.path.join(iscc_cli.APP_DIR, "fpcalc.exe")
-    return os.path.join(iscc_cli.APP_DIR, "fpcalc")
+        return os.path.join(iscc_cli.APP_DIR, "fpcalc-{}.exe".format(FPCALC_VERSION))
+    return os.path.join(iscc_cli.APP_DIR, "fpcalc-{}".format(FPCALC_VERSION))
 
 
 def is_installed():
@@ -47,6 +47,7 @@ def download():
 
 
 def extract(archive):
+    """Extract archive with fpcalc executable."""
     if archive.endswith(".zip"):
         with zipfile.ZipFile(archive, "r") as zip_file:
             for member in zip_file.namelist():
@@ -83,6 +84,6 @@ def get_version_info():
     """Get fpcalc version"""
     try:
         r = subprocess.run([exe_path(), "-v"], stdout=subprocess.PIPE)
-        return r.stdout.decode("utf-8").strip().split()[-1]
+        return r.stdout.decode("utf-8").strip().split()[2]
     except FileNotFoundError:
         return 'WARNING: Not Installed - run "iscc init" to install!'
diff --git a/iscc_cli/lib.py b/iscc_cli/lib.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 """Expose cli commands with standard python api."""
 from typing import List, Dict
-from iscc_cli.gen import gen
-from iscc_cli.batch import batch
-from iscc_cli.web import web
+from iscc_cli.commands.gen import gen
+from iscc_cli.commands.batch import batch
+from iscc_cli.commands.web import web
 
 
 def iscc_from_file(file, guess=False, title="", extra="") -> Dict:

diff --git a/iscc_cli/utils.py b/iscc_cli/utils.py
@@ -5,7 +5,7 @@
 import re
 import textwrap
 from os import getcwd, listdir, walk
-from os.path import isfile, splitext, isdir, join
+from os.path import isfile, splitext, isdir, join, basename
 from urllib.parse import urlparse
 import click
 import iscc
@@ -76,15 +76,16 @@ def mime_to_gmt(mime_type, file_path=None):
         return gmt
 
 
-def get_title(tika_result: dict, guess=False):
+def get_title(tika_result: dict, guess=False, uri=None):
     title = ""
 
     meta = tika_result.get("metadata")
     if meta:
         title = meta.get("dc:title", "")
         title = title[0].strip() if isinstance(title, list) else title.strip()
         if not title:
-            title = meta.get("title", "").strip()
+            title = meta.get("title", "")
+            title = title[0].strip() if isinstance(title, list) else title.strip()
 
     # See if string would survive normalization
     norm_title = iscc.text_normalize(title, keep_ws=True)
@@ -95,6 +96,12 @@ def get_title(tika_result: dict, guess=False):
             first_line = content.strip().splitlines()[0]
             title = iscc.text_trim(iscc.text_normalize(first_line, keep_ws=True))
 
+    if not title and uri is not None:
+        result = urlparse(uri)
+        base = basename(result.path)
+        title = splitext(base)[0]
+        title = title.replace("-", " ")
+        title = title.replace("_", " ")
     return title