HOWTO: Find Duplicated Files in Directory Tree

vermaden · Apr 24, 2010

I wrote this script some time ago to find duplicated files; they may be compared by file name, size or MD5 checksum. Feel free to point out any issues with it.

It works with FreeBSD and Linux, partially ported to Solaris, but if md5(1) and stat(1) have the same syntax as in FreeBSD (or Linux) it may also be used on other BSDs or any other UNIX system.

Example output/usage:

Code:

% [color="Blue"]duplicated_files.sh[/color]
usage: duplicated_files.sh OPTION DIRECTORY
  OPTIONS: -n   check by name (fast)
           -s   check by size (medium)
           -m   check by md5  (slow)
           -N   same as '-n' but with delete instructions printed
           -S   same as '-s' but with delete instructions printed
           -M   same as '-m' but with delete instructions printed
  EXAMPLE: duplicated_files.sh -s /mnt

Code:

% [color="Blue"]duplicated_files.sh -m tmp[/color]
count: 2 | md5: eb36b88619424b05288a0a8918b822f0
  tmp/segoeuib.ttf
  tmp/test/segoeuib.ttf

count: 3 | md5: 4e1e3521a4396110e59229bed85b0cf9
  tmp/cam/fdd/file.htm
  tmp/cam/gf/file.htm
  tmp/cam/nf7/file.htm

Code:

% [color="Blue"]duplicated_files.sh -N tmp[/color]
count: 2 | file: segoeuil.ttf
  sudo rm -rf "tmp/segoeuil.ttf"
  sudo rm -rf "tmp/test/segoeuil.ttf"

count: 3 | file: file.htm
  sudo rm -rf "tmp/cam/nf7/file.htm"
  sudo rm -rf "tmp/cam/gf/file.htm"
  sudo rm -rf "tmp/cam/fdd/file.htm"

duplicated_files.sh

Code:

#! /bin/sh

# find duplicated files in directory tree
# comparing by file NAME, SIZE or MD5 checksum
# --------------------------------------------
# LICENSE(s): BSD / CDDL
# --------------------------------------------
# vermaden [AT] interia [DOT] pl
# http://strony.toya.net.pl/~vermaden/links.htm

# Print the usage summary on stdout and terminate the script.
# Arguments: none (the script name is taken from ${0})
# Exits:     always, with status 1
__usage() {
  # ${0##*/} removes the directory part of the script path without
  # forking basename(1); unlike an unquoted "basename ${0}" it stays
  # intact when the path contains whitespace.
  cat << EOF
usage: ${0##*/} OPTION DIRECTORY
  OPTIONS: -n   check by name (fast)
           -s   check by size (medium)
           -m   check by md5  (slow)
           -N   same as '-n' but with delete instructions printed
           -S   same as '-s' but with delete instructions printed
           -M   same as '-m' but with delete instructions printed
  EXAMPLE: ${0##*/} -s /mnt
EOF
  exit 1
  }

# Store in PREFIX the command that is printed in front of every path
# when delete instructions are requested:
#   uid 0            -> "rm -rf"
#   other uid, SunOS -> "pfexec rm -rf"
#   other uid, else  -> "sudo rm -rf"
__prefix() {
  if [ "$( id -u )" = "0" ]
  then
    PREFIX="rm -rf"
    return
  fi

  # not root: the removal command has to be wrapped by a privilege helper
  if [ "$( uname )" = "SunOS" ]
  then
    PREFIX="pfexec rm -rf"
  else
    PREFIX="sudo rm -rf"
  fi
  }

# Select the platform specific checksum and file size commands.
# Globals: MD5  (written) - command used by __md5 to checksum files
#          STAT (written) - command used by __size to print a file size
# Outputs: a notice on stdout when the platform is not supported
# Exits:   1 on SunOS (porting hints are printed) and on every system
#          other than FreeBSD and Linux
__crossplatform() {
  case $( uname ) in
    (FreeBSD)
      MD5="md5 -r"
      STAT="stat -f %z"
      ;;
    (Linux)
      MD5="md5sum"
      STAT="stat -c %s"
      ;;
    (SunOS)
      echo "INFO: supported systems: FreeBSD Linux"
      echo
      echo "Porting to Solaris/OpenSolaris"
      echo "  -- provide values for MD5/STAT in '$( basename "${0}" ):__crossplatform()'"
      echo "  -- use digest(1) instead for md5 sum calculation"
      echo "       $ digest -a md5 file"
      echo "  -- pfexec(1) is already used in '$( basename "${0}" ):__prefix()'"
      echo
      exit 1
      # this terminator is required: without it the following '(*)' is
      # parsed as a subshell inside the SunOS arm and the default arm
      # silently disappears, letting unsupported systems continue with
      # MD5 and STAT unset
      ;;
    (*)
      echo "INFO: supported systems: FreeBSD Linux"
      exit 1
      ;;
  esac
  }

# Report groups of regular files below a directory that share the same
# MD5 checksum.
# Arguments: ${1} - directory to scan
# Globals:   MD5    (read, set by __crossplatform) - checksum command
#            PREFIX (read) - when non-empty every path is printed as
#                            '<PREFIX> "<path>"' instead of the bare path
# Outputs:   for each checksum owned by more than one file a header
#            "count: N | md5: SUM", the indented paths and an empty line
__md5() {
  __crossplatform
  # One pass: equal checksums become adjacent after sorting, awk then
  # groups consecutive lines. No scratch file is needed and a checksum
  # is only ever compared with the checksum column, never with a file
  # name that happens to contain the same hex string.
  # 'sort -n' is kept from the original to retain its group order.
  find "${1}" -type f -exec ${MD5} {} + \
    | sort -n \
    | awk -v prefix="${PREFIX}" '
        # print the collected group when it holds more than one file
        function emit(    i) {
          if (count > 1) {
            printf "count: %d | md5: %s\n", count, sum
            for (i = 1; i <= count; i++) {
              if (prefix != "")
                printf "  %s \"%s\"\n", prefix, name[i]
              else
                printf "  %s\n", name[i]
            }
            print ""
          }
          count = 0
        }
        {
          # appending "" forces a string comparison even for checksums
          # that look like numbers
          key = $1 ""
          if (key != sum) {
            emit()
            sum = key
          }
          # drop the checksum column and the blanks that follow it;
          # the remainder of the line is the path
          path = $0
          sub(/^[^ ]+ +/, "", path)
          name[++count] = path
        }
        END { emit() }'
  }

# Report groups of regular files below a directory that have the same
# size.
# Arguments: ${1} - directory to scan
# Globals:   STAT   (read, set by __crossplatform) - prints a file size
#            PREFIX (read) - when non-empty every path is printed as
#                            '<PREFIX> "<path>"' instead of the bare path
# Outputs:   for each size shared by more than one file a header
#            "count: N | size: <KB>KB (<bytes> bytes)", the indented
#            paths and an empty line
__size() {
  __crossplatform
  find "${1}" -type f -exec ${STAT} {} + \
    | sort -n \
    | uniq -c \
    | while read -r COUNT SIZE
      do
        [ "${COUNT}" -eq 1 ] && continue
        # integer division, same result as the former call to bc(1)
        SIZE_KB=$(( SIZE / 1024 ))
        echo "count: ${COUNT} | size: ${SIZE_KB}KB (${SIZE} bytes)"
        # "${1}" is quoted so directories containing blanks work, and
        # '{}' is passed as an argument of its own because replacing
        # '{}' inside a larger argument is not guaranteed by POSIX find
        if [ -n "${PREFIX}" ]
        then
          find "${1}" -type f -size "${SIZE}c" \
            -exec printf "  ${PREFIX} \"%s\"\n" {} +
        else
          find "${1}" -type f -size "${SIZE}c" \
            -exec printf '  %s\n' {} +
        fi
        echo
      done
  }

# Report groups of regular files below a directory that share the same
# file name, compared case-insensitively.
# Arguments: ${1} - directory to scan
# Globals:   PREFIX (read) - when non-empty every path is printed as
#                            '<PREFIX> "<path>"' instead of the bare path
# Outputs:   for each name used by more than one file a header
#            "count: N | file: <lowercased name>", the indented paths
#            and an empty line; groups are ordered by descending count
__file() {
  __crossplatform
  # sed strips everything up to the last '/', which yields the base
  # name of each path without splitting names on blanks or quotes the
  # way 'xargs basename' does
  find "${1}" -type f \
    | sed -e 's|.*/||' \
    | tr '[A-Z]' '[a-z]' \
    | sort -n \
    | uniq -c \
    | sort -n -r \
    | while IFS= read -r LINE
      do
        # uniq -c prints "<padding><count> <name>"; split it with
        # parameter expansion so blanks and glob characters inside the
        # name are left untouched
        LINE=${LINE#"${LINE%%[! ]*}"}
        COUNT=${LINE%% *}
        # counts are sorted in descending order, nothing but unique
        # names can follow
        [ "${COUNT}" -eq 1 ] && break
        FILE=${LINE#* }
        printf 'count: %s | file: %s\n' "${COUNT}" "${FILE}"
        # -iname takes a glob pattern, so every pattern metacharacter
        # of the name has to be escaped to match literally
        PATTERN=$( printf '%s\n' "${FILE}" | sed -e 's/[][\\*?]/\\&/g' )
        # -type f keeps directories that merely share the name out of
        # the listing, only regular files were counted above
        if [ -n "${PREFIX}" ]
        then
          find "${1}" -type f -iname "${PATTERN}" \
            -exec printf "  ${PREFIX} \"%s\"\n" {} +
        else
          find "${1}" -type f -iname "${PATTERN}" \
            -exec printf '  %s\n' {} +
        fi
        echo
      done
  }

# main()

# exactly two arguments are required and the second one has to be an
# existing directory, anything else prints the usage and exits
[ ${#} -ne 2  ] && __usage
[ ! -d "${2}" ] && __usage

# Scratch file available to __md5. mktemp(1) creates it atomically
# under an unpredictable name; a fixed name in /tmp could be
# pre-created or symlinked by another user and would then be truncated
# with the privileges of whoever runs this script.
DUPLICATES_FILE=$( mktemp /tmp/duplicated_files.XXXXXX ) || exit 1
# remove the scratch file on every exit path, interrupted runs included
trap 'rm -f "${DUPLICATES_FILE}"' EXIT
trap 'exit 1' HUP INT TERM

# lower case options only list the duplicates, upper case options call
# __prefix first so that every path is printed as a delete instruction
case "${1}" in
  (-n)           __file "${2}" ;;
  (-m)           __md5  "${2}" ;;
  (-s)           __size "${2}" ;;
  (-N) __prefix; __file "${2}" ;;
  (-M) __prefix; __md5  "${2}" ;;
  (-S) __prefix; __size "${2}" ;;
  (*)  __usage ;;
esac

sixtydoses · Apr 24, 2010

Code:

-N   same as '-f' but with delete instructions printed

Guess there's a typo there.

Script's working great (testing out on small directories). Thanks.

vermaden · Apr 24, 2010

@sixtydoses

Thanks.

dennylin93 · May 1, 2010

Is it possible to do multiple checks, i.e., check the file size first, then check the MD5 checksum only if the file sizes are the same?

vermaden · May 1, 2010

@dennylin93

Currently no, but I will be working on a version that first compares sizes and, if the sizes are the same, compares MD5 checksums.

anomie · May 3, 2010

@vermaden: That is nice, clean Bourne shell code.

I found something when testing on a Fedora box.

Code:

$ cat /etc/redhat-release 
Fedora release 11 (Leonidas)

$ ./duplicated_files.sh -n music
count: 2 | file: 

count: 2 | file: 

count: 2 | file: 

count: 2 | file: 

count: 2 | file: 

...

I made a quick change shown in the unified diff:

Code:

--- duplicated_files.sh	2010-05-03 12:54:45.983553694 -0500
+++ duplicated_files.NEW.sh	2010-05-03 13:03:08.788553799 -0500
@@ -133,7 +133,7 @@
       do
         COUNT=$( echo ${LINE} | awk '{print $1}' )
         [ ${COUNT} -eq 1 ] && break
-        FILE=$( echo ${LINE} | cut -d ' ' -f 2-10000 | 2> /dev/null )
+        FILE=$( echo ${LINE} | cut -d ' ' -f 2-10000 2> /dev/null )
         echo "count: ${COUNT} | file: ${FILE}"
         FILE=$( echo ${FILE} | sed -e s/'\['/'\\\['/g -e s/'\]'/'\\\]'/g )
         if [ -n "${PREFIX}" ]

Now it shows:

Code:

$ cat /etc/redhat-release 
Fedora release 11 (Leonidas)

$ ./duplicated_files.NEW.sh -n music
count: 2 | file: dj-shadow13.mp3
  music/mytunes/home/me/music/dj-shadow/dj-shadow13.mp3
  music/dj-shadow/dj-shadow13.mp3

count: 2 | file: dj-shadow12.mp3
  music/mytunes/home/me/music/dj-shadow/dj-shadow12.mp3
  music/dj-shadow/dj-shadow12.mp3

count: 2 | file: dj-shadow11.mp3
  music/mytunes/home/me/music/dj-shadow/dj-shadow11.mp3
  music/dj-shadow/dj-shadow11.mp3

count: 2 | file: dj-shadow10.mp3
  music/mytunes/home/me/music/dj-shadow/dj-shadow10.mp3
  music/dj-shadow/dj-shadow10.mp3

count: 2 | file: dj-shadow09.mp3
  music/mytunes/home/me/music/dj-shadow/dj-shadow09.mp3
  music/dj-shadow/dj-shadow09.mp3

...

I didn't do much in the way of regression testing. So hopefully I didn't break something else with that change.

vermaden · May 3, 2010

anomie said:
@vermaden: That is nice, clean Bourne shell code.

I found something when testing on a Fedora box.

Hi and thanks. The sad thing is that I had the correct version on my hard disk, while I added the (wrong) changes to the 'online' version.

It was meant to be like this from the beginning:

Code:

+        FILE=$( echo ${LINE} | cut -d ' ' -f 2-10000 2> /dev/null )

mfaridi · Dec 21, 2010

When I run that script, my terminal prompt changes from

Code:

narges ~ [277]

to something like this

Code:

%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%}

This happens for a normal user and for the root user too.
The normal user uses the ZSH shell and root uses the default shell.
Nothing happens, and I have to type exit or use ctrl+c to exit.

vermaden · Dec 21, 2010

nothing happen and I have to type exit or use ctrl+c to exit

It takes time to calculate all the needed data. How big is the directory you ran it on?

mfaridi · Dec 21, 2010

vermaden said:
It takes time to calculate all the needed data. How big is the directory you ran it on?

But I want to understand why my shell prompt changes.
The directory is not that big.

vermaden · Dec 21, 2010

I have to type exit or use ctrl+c to exit

Do not do that and wait until it finishes ...

mfaridi · Dec 21, 2010

vermaden said:
Do not do that and wait until it finishes ...

if I run this command

Code:

./duplicated_files.sh -n /home/narges/i686/

I see this

Code:

%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%} 
%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%} 
%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%} 
%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%} 
%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%} 
%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%} 
%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%} 
%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%} 
%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%} 
%{%}%n%{%} %{%}%~%{%} %{%}[%!]%{%}

vermaden · Dec 21, 2010

@mfaridi

Copy the whole script again from this thread into your disk and check again.

mfaridi · Dec 21, 2010

vermaden said:
@mfaridi

Copy the whole script again from this thread into your disk and check again.

Thanks
right now it work without problem , and work fast and shell prompt does not change
what was problem ?

vermaden · Dec 21, 2010

mfaridi said:
Thanks
right now it work without problem , and work fast and shell prompt does not change
what was problem ?

You tell me, I did not have such problems.

mfaridi · Dec 21, 2010

vermaden said:
You tell me, I did not have such problems.

I deleted the old one and made a new one. I did nothing else: I only deleted all the lines in duplicated_files.sh, copied all the lines from your howto and ran it.
I think your first script had a problem that changed my shell prompt style and made the script stop working, but the new one works well for me and does not change the shell prompt style.

vermaden · Dec 21, 2010

The old one did not change the shell prompt either.