Skip to content

Commit

Permalink
dedupe: add --by-hash to dedupe on hash not file name - fixes rclone#…
Browse files Browse the repository at this point in the history
  • Loading branch information
ncw committed Dec 2, 2020
1 parent e073720 commit 507f861
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 45 deletions.
36 changes: 24 additions & 12 deletions cmd/dedupe/dedupe.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ import (

var (
	// dedupeMode selects how remaining duplicates are resolved
	// (interactive|skip|first|newest|oldest|largest|smallest|rename);
	// interactive by default.
	dedupeMode = operations.DeduplicateInteractive
	// byHash, when true, groups duplicates by identical hash rather
	// than by identical file name (set via the --by-hash flag).
	byHash = false
)

// init registers the dedupe command with the root command and wires up
// its command-specific flags.
func init() {
	cmd.Root.AddCommand(commandDefinition)
	cmdFlag := commandDefinition.Flags()
	flags.FVarP(cmdFlag, &dedupeMode, "dedupe-mode", "", "Dedupe mode interactive|skip|first|newest|oldest|largest|smallest|rename.")
	// Typo fixed in user-visible help text: "indentical" -> "identical".
	flags.BoolVarP(cmdFlag, &byHash, "by-hash", "", false, "Find identical hashes rather than names")
}

var commandDefinition = &cobra.Command{
Expand All @@ -27,20 +29,26 @@ var commandDefinition = &cobra.Command{
By default ` + "`dedupe`" + ` interactively finds files with duplicate
names and offers to delete all but one or rename them to be
different.
different. This is known as deduping by name.
This is only useful with backends like Google Drive which can have
duplicate file names. It can be run on wrapping backends (e.g. crypt) if
they wrap a backend which supports duplicate file names.
Deduping by name is only useful with backends like Google Drive which
can have duplicate file names. It can be run on wrapping backends
(e.g. crypt) if they wrap a backend which supports duplicate file
names.
In the first pass it will merge directories with the same name. It
will do this iteratively until all the identically named directories
have been merged.
However if --by-hash is passed in then dedupe will find files with
duplicate hashes instead which will work on any backend which supports
at least one hash. This can be used to find files with duplicate
content. This is known as deduping by hash.
In the second pass, for every group of duplicate file names, it will
delete all but one identical files it finds without confirmation.
This means that for most duplicated files the ` + "`dedupe`" + `
command will not be interactive.
If deduping by name, first rclone will merge directories with the same
name. It will do this iteratively until all the identically named
directories have been merged.
Next, if deduping by name, for every group of duplicate file names /
hashes, it will delete all but one identical files it finds without
confirmation. This means that for most duplicated files the ` +
"`dedupe`" + ` command will not be interactive.
` + "`dedupe`" + ` considers files to be identical if they have the
same file path and the same hash. If the backend does not support hashes (e.g. crypt wrapping
Expand All @@ -49,6 +57,10 @@ use the ` + "`--size-only`" + ` flag then files will be considered
identical if they have the same size (any hash will be ignored). This
can be useful on crypt backends which do not support hashes.
Next rclone will resolve the remaining duplicates. Exactly which
action is taken depends on the dedupe mode. By default rclone will
interactively query the user for each one.
**Important**: Since this can cause data loss, test first with the
` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
Expand Down Expand Up @@ -131,7 +143,7 @@ Or
}
fdst := cmd.NewFsSrc(args)
cmd.Run(false, false, command, func() error {
return operations.Deduplicate(context.Background(), fdst, dedupeMode)
return operations.Deduplicate(context.Background(), fdst, dedupeMode, byHash)
})
},
}
78 changes: 54 additions & 24 deletions fs/operations/dedupe.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
}

// dedupeInteractive interactively dedupes the slice of objects
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
for i, o := range objs {
hashValue := ""
Expand All @@ -150,9 +150,17 @@ func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string
hashValue = err.Error()
}
}
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
if byHash {
fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote())
} else {
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
}
}
commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
if !byHash {
commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
}
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
switch config.Command(commands) {
case 's':
case 'k':
keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
Expand Down Expand Up @@ -298,31 +306,51 @@ func sortSmallestFirst(objs []fs.Object) {
// Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful with
// Google Drive which can have duplicate file names.
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
fs.Infof(f, "Looking for duplicates using %v mode.", mode)
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
ci := fs.GetConfig(ctx)
// find a hash to use
ht := f.Hashes().GetOne()
what := "names"
if byHash {
if ht == hash.None {
return errors.Errorf("%v has no hashes", f)
}
what = ht.String() + " hashes"
}
fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)

// Find duplicate directories first and fix them
duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
if err != nil {
return err
}
if len(duplicateDirs) != 0 {
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
if !byHash {
duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
if err != nil {
return err
}
if len(duplicateDirs) != 0 {
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
if err != nil {
return err
}
}
}

// find a hash to use
ht := f.Hashes().GetOne()

// Now find duplicate files
files := map[string][]fs.Object{}
err = walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
err := walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
entries.ForObject(func(o fs.Object) {
remote := o.Remote()
files[remote] = append(files[remote], o)
var remote string
var err error
if byHash {
remote, err = o.Hash(ctx, ht)
if err != nil {
fs.Errorf(o, "Failed to hash: %v", err)
remote = ""
}
} else {
remote = o.Remote()
}
if remote != "" {
files[remote] = append(files[remote], o)
}
})
return nil
})
Expand All @@ -332,15 +360,17 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {

for remote, objs := range files {
if len(objs) > 1 {
fs.Logf(remote, "Found %d files with duplicate names", len(objs))
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
if len(objs) <= 1 {
fs.Logf(remote, "All duplicates removed")
continue
fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
if !byHash {
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
if len(objs) <= 1 {
fs.Logf(remote, "All duplicates removed")
continue
}
}
switch mode {
case DeduplicateInteractive:
dedupeInteractive(ctx, f, ht, remote, objs)
dedupeInteractive(ctx, f, ht, remote, objs, byHash)
case DeduplicateFirst:
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateNewest:
Expand All @@ -358,7 +388,7 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
sortSmallestFirst(objs)
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateSkip:
fs.Logf(remote, "Skipping %d files with duplicate names", len(objs))
fs.Logf(remote, "Skipping %d files with duplicate names %s", len(objs), what)
default:
//skip
}
Expand Down
45 changes: 36 additions & 9 deletions fs/operations/dedupe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/rclone/rclone/fs/operations"
"github.com/rclone/rclone/fs/walk"
"github.com/rclone/rclone/fstest"
"github.com/rclone/rclone/lib/random"
"github.com/spf13/pflag"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand All @@ -36,6 +37,12 @@ func skipIfNoHash(t *testing.T, f fs.Fs) {
}
}

// skipIfNoModTime skips the current test when the remote cannot store
// modification times (its precision reports ModTimeNotSupported).
func skipIfNoModTime(t *testing.T, f fs.Fs) {
	if f.Precision() < fs.ModTimeNotSupported {
		return
	}
	t.Skip("Can't run this test without modtimes")
}

func TestDeduplicateInteractive(t *testing.T) {
r := fstest.NewRun(t)
defer r.Finalise()
Expand All @@ -47,7 +54,7 @@ func TestDeduplicateInteractive(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file1)
Expand All @@ -69,7 +76,7 @@ func TestDeduplicateSkip(t *testing.T) {
files = append(files, file3)
r.CheckWithDuplicates(t, files...)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
require.NoError(t, err)

r.CheckWithDuplicates(t, file1, file3)
Expand All @@ -92,7 +99,7 @@ func TestDeduplicateSizeOnly(t *testing.T) {
ci.SizeOnly = false
}()

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
require.NoError(t, err)

r.CheckWithDuplicates(t, file1, file3)
Expand All @@ -108,7 +115,7 @@ func TestDeduplicateFirst(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one BB", t1)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst, false)
require.NoError(t, err)

// list until we get one object
Expand All @@ -131,18 +138,38 @@ func TestDeduplicateNewest(t *testing.T) {
r := fstest.NewRun(t)
defer r.Finalise()
skipIfCantDedupe(t, r.Fremote)
skipIfNoModTime(t, r.Fremote)

file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
file2 := r.WriteUncheckedObject(context.Background(), "one", "This is one too", t2)
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file3)
}

// TestDeduplicateNewestByHash checks deduping with byHash=true: files
// with identical contents (hence identical hashes) are grouped across
// different names/directories and all but the newest deleted, while a
// file with different contents is left untouched.
func TestDeduplicateNewestByHash(t *testing.T) {
	r := fstest.NewRun(t)
	defer r.Finalise()
	skipIfNoHash(t, r.Fremote)
	skipIfNoModTime(t, r.Fremote)
	contents := random.String(100)

	// Three files with identical contents at different paths and times…
	file1 := r.WriteObject(context.Background(), "one", contents, t1)
	file2 := r.WriteObject(context.Background(), "also/one", contents, t2)
	file3 := r.WriteObject(context.Background(), "another", contents, t3)
	// …and one with different contents that must not be deduped.
	file4 := r.WriteObject(context.Background(), "not-one", "stuff", t3)
	fstest.CheckItems(t, r.Fremote, file1, file2, file3, file4)

	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, true)
	require.NoError(t, err)

	// Only the newest of the identical files (file3) plus the
	// unrelated file4 should remain.
	fstest.CheckItems(t, r.Fremote, file3, file4)
}

func TestDeduplicateOldest(t *testing.T) {
r := fstest.NewRun(t)
defer r.Finalise()
Expand All @@ -153,7 +180,7 @@ func TestDeduplicateOldest(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file1)
Expand All @@ -169,7 +196,7 @@ func TestDeduplicateLargest(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file3)
Expand All @@ -185,7 +212,7 @@ func TestDeduplicateSmallest(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest, false)
require.NoError(t, err)

fstest.CheckItems(t, r.Fremote, file1)
Expand All @@ -202,7 +229,7 @@ func TestDeduplicateRename(t *testing.T) {
file4 := r.WriteUncheckedObject(context.Background(), "one-1.txt", "This is not a duplicate", t1)
r.CheckWithDuplicates(t, file1, file2, file3, file4)

err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename, false)
require.NoError(t, err)

require.NoError(t, walk.ListR(context.Background(), r.Fremote, "", true, -1, walk.ListObjects, func(entries fs.DirEntries) error {
Expand Down

0 comments on commit 507f861

Please sign in to comment.