Skip to content

Commit

Permalink
dedupe: add --by-hash to dedupe on hash not file name - fixes rclone#…
Browse files Browse the repository at this point in the history
  • Loading branch information
ncw committed Dec 2, 2020
1 parent e073720 commit 507f861
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 45 deletions.
36 changes: 24 additions & 12 deletions cmd/dedupe/dedupe.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ import (

var (
	// dedupeMode selects how remaining duplicates are resolved
	// (interactive|skip|first|newest|oldest|largest|smallest|rename);
	// interactive by default.
	dedupeMode = operations.DeduplicateInteractive
	// byHash, when true, groups duplicates by identical hash rather
	// than by identical file name (set via the --by-hash flag).
	byHash = false
)

// init registers the dedupe command with the root command and wires up
// its command-specific flags.
func init() {
	cmd.Root.AddCommand(commandDefinition)
	cmdFlag := commandDefinition.Flags()
	flags.FVarP(cmdFlag, &dedupeMode, "dedupe-mode", "", "Dedupe mode interactive|skip|first|newest|oldest|largest|smallest|rename.")
	// Typo fixed in user-visible help text: "indentical" -> "identical".
	flags.BoolVarP(cmdFlag, &byHash, "by-hash", "", false, "Find identical hashes rather than names")
}

var commandDefinition = &cobra.Command{
Expand All @@ -27,20 +29,26 @@ var commandDefinition = &cobra.Command{
By default ` + "`dedupe`" + ` interactively finds files with duplicate
names and offers to delete all but one or rename them to be
different.
different. This is known as deduping by name.
This is only useful with backends like Google Drive which can have
duplicate file names. It can be run on wrapping backends (e.g. crypt) if
they wrap a backend which supports duplicate file names.
Deduping by name is only useful with backends like Google Drive which
can have duplicate file names. It can be run on wrapping backends
(e.g. crypt) if they wrap a backend which supports duplicate file
names.
In the first pass it will merge directories with the same name. It
will do this iteratively until all the identically named directories
have been merged.
However if --by-hash is passed in then dedupe will find files with
duplicate hashes instead which will work on any backend which supports
at least one hash. This can be used to find files with duplicate
content. This is known as deduping by hash.
In the second pass, for every group of duplicate file names, it will
delete all but one identical files it finds without confirmation.
This means that for most duplicated files the ` + "`dedupe`" + `
command will not be interactive.
If deduping by name, first rclone will merge directories with the same
name. It will do this iteratively until all the identically named
directories have been merged.
Next, if deduping by name, for every group of duplicate file names /
hashes, it will delete all but one identical files it finds without
confirmation. This means that for most duplicated files the ` +
"`dedupe`" + ` command will not be interactive.
` + "`dedupe`" + ` considers files to be identical if they have the
same file path and the same hash. If the backend does not support hashes (e.g. crypt wrapping
Expand All @@ -49,6 +57,10 @@ use the ` + "`--size-only`" + ` flag then files will be considered
identical if they have the same size (any hash will be ignored). This
can be useful on crypt backends which do not support hashes.
Next rclone will resolve the remaining duplicates. Exactly which
action is taken depends on the dedupe mode. By default rclone will
interactively query the user for each one.
**Important**: Since this can cause data loss, test first with the
` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
Expand Down Expand Up @@ -131,7 +143,7 @@ Or
}
fdst := cmd.NewFsSrc(args)
cmd.Run(false, false, command, func() error {
return operations.Deduplicate(context.Background(), fdst, dedupeMode)
return operations.Deduplicate(context.Background(), fdst, dedupeMode, byHash)
})
},
}
78 changes: 54 additions & 24 deletions fs/operations/dedupe.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
}

// dedupeInteractive interactively dedupes the slice of objects
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
for i, o := range objs {
hashValue := ""
Expand All @@ -150,9 +150,17 @@ func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string
hashValue = err.Error()
}
}
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
if byHash {
fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote())
} else {
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
}
}
commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
if !byHash {
commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
}
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
switch config.Command(commands) {
case 's':
case 'k':
keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
Expand Down Expand Up @@ -298,31 +306,51 @@ func sortSmallestFirst(objs []fs.Object) {
// Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful with
// Google Drive which can have duplicate file names.
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
fs.Infof(f, "Looking for duplicates using %v mode.", mode)
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
ci := fs.GetConfig(ctx)
// find a hash to use
ht := f.Hashes().GetOne()
what := "names"
if byHash {
if ht == hash.None {
return errors.Errorf("%v has no hashes", f)
}
what = ht.String() + " hashes"
}
fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)

// Find duplicate directories first and fix them
duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
if err != nil {
return err
}
if len(duplicateDirs) != 0 {
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
if !byHash {
duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
if err != nil {
return err
}
if len(duplicateDirs) != 0 {
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
if err != nil {
return err
}
}
}

// find a hash to use
ht := f.Hashes().GetOne()

// Now find duplicate files
files := map[string][]fs.Object{}
err = walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
err := walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
entries.ForObject(func(o fs.Object) {
remote := o.Remote()
files[remote] = append(files[remote], o)
var remote string
var err error
if byHash {
remote, err = o.Hash(ctx, ht)
if err != nil {
fs.Errorf(o, "Failed to hash: %v", err)
remote = ""
}
} else {
remote = o.Remote()
}
if remote != "" {
files[remote] = append(files[remote], o)
}
})
return nil
})
Expand All @@ -332,15 +360,17 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {

for remote, objs := range files {
if len(objs) > 1 {
fs.Logf(remote, "Found %d files with duplicate names", len(objs))
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
if len(objs) <= 1 {
fs.Logf(remote, "All duplicates removed")
continue
fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
if !byHash {
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
if len(objs) <= 1 {
fs.Logf(remote, "All duplicates removed")
continue
}
}
switch mode {
case DeduplicateInteractive:
dedupeInteractive(ctx, f, ht, remote, objs)
dedupeInteractive(ctx, f, ht, remote, objs, byHash)
case DeduplicateFirst:
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateNewest:
Expand All @@ -358,7 +388,7 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
sortSmallestFirst(objs)
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateSkip:
fs.Logf(remote, "Skipping %d files with duplicate names", len(objs))
fs.Logf(remote, "Skipping %d files with duplicate names %s", len(objs), what)
default:
//skip
}
Expand Down
45 changes: 36 additions & 9 deletions fs/operations/dedupe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/rclone/rclone/fs/operations"
"github.com/rclone/rclone/fs/walk"
"github.com/rclone/rclone/fstest"
"github.com/rclone/rclone/lib/random"
"github.com/spf13/pflag"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand All @@ -36,6 +37,12 @@ func skipIfNoHash(t *testing.T, f fs.Fs) {
}
}

// skipIfNoModTime skips the current test when the remote cannot store
// modification times (its precision reports ModTimeNotSupported).
func skipIfNoModTime(t *testing.T, f fs.Fs) {
	if f.Precision() < fs.ModTimeNotSupported {
		return
	}
	t.Skip("Can't run this test without modtimes")
}

func TestDeduplicateInteractive(t *testing.T) {
r := fstest.NewRun(t)
defer r.Finalise()
Expand All @@ -47,7 +54,7 @@ func TestDeduplicateInteractive(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file1)
Expand All @@ -69,7 +76,7 @@ func TestDeduplicateSkip(t *testing.T) {
files = append(files, file3)
r.CheckWithDuplicates(t, files...)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
require.NoError(t, err)

r.CheckWithDuplicates(t, file1, file3)
Expand All @@ -92,7 +99,7 @@ func TestDeduplicateSizeOnly(t *testing.T) {
ci.SizeOnly = false
}()

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
require.NoError(t, err)

r.CheckWithDuplicates(t, file1, file3)
Expand All @@ -108,7 +115,7 @@ func TestDeduplicateFirst(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one BB", t1)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst, false)
require.NoError(t, err)

// list until we get one object
Expand All @@ -131,18 +138,38 @@ func TestDeduplicateNewest(t *testing.T) {
r := fstest.NewRun(t)
defer r.Finalise()
skipIfCantDedupe(t, r.Fremote)
skipIfNoModTime(t, r.Fremote)

file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
file2 := r.WriteUncheckedObject(context.Background(), "one", "This is one too", t2)
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file3)
}

// TestDeduplicateNewestByHash checks deduping with byHash=true: files
// with identical contents (hence identical hashes) are grouped across
// different names/directories and all but the newest deleted, while a
// file with different contents is left untouched.
func TestDeduplicateNewestByHash(t *testing.T) {
	r := fstest.NewRun(t)
	defer r.Finalise()
	skipIfNoHash(t, r.Fremote)
	skipIfNoModTime(t, r.Fremote)
	contents := random.String(100)

	// Three files with identical contents at different paths and times…
	file1 := r.WriteObject(context.Background(), "one", contents, t1)
	file2 := r.WriteObject(context.Background(), "also/one", contents, t2)
	file3 := r.WriteObject(context.Background(), "another", contents, t3)
	// …and one with different contents that must not be deduped.
	file4 := r.WriteObject(context.Background(), "not-one", "stuff", t3)
	fstest.CheckItems(t, r.Fremote, file1, file2, file3, file4)

	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, true)
	require.NoError(t, err)

	// Only the newest of the identical files (file3) plus the
	// unrelated file4 should remain.
	fstest.CheckItems(t, r.Fremote, file3, file4)
}

func TestDeduplicateOldest(t *testing.T) {
r := fstest.NewRun(t)
defer r.Finalise()
Expand All @@ -153,7 +180,7 @@ func TestDeduplicateOldest(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file1)
Expand All @@ -169,7 +196,7 @@ func TestDeduplicateLargest(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file3)
Expand All @@ -185,7 +212,7 @@ func TestDeduplicateSmallest(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file1)
Expand All @@ -202,7 +229,7 @@ func TestDeduplicateRename(t *testing.T) {
file4 := r.WriteUncheckedObject(context.Background(), "one-1.txt", "This is not a duplicate", t1)
r.CheckWithDuplicates(t, file1, file2, file3, file4)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename, false)
require.NoError(t, err)

require.NoError(t, walk.ListR(context.Background(), r.Fremote, "", true, -1, walk.ListObjects, func(entries fs.DirEntries) error {
Expand Down

0 comments on commit 507f861

Please sign in to comment.