#!/bin/sh
#
# Interactive duplicate-mail remover.
#
# Usage: <this script> DIRECTORY
#
# 1. Collect every Message-Id value found in the files under DIRECTORY
#    (the "rejected" directory and .mh_sequences are not searched).
# 2. For each id carried by more than one file directly inside DIRECTORY,
#    compare the candidates two at a time.  Their headers are compared with
#    progressively looser normalisation; as soon as two candidates compare
#    equal one of them is dropped automatically, otherwise the user decides.
# 3. Every dropped file is moved to DIRECTORY/rejected/N, where N is the
#    lowest number not yet used there.
#
# Candidate files are tracked as a space-separated list, so neither DIRECTORY
# nor the file names inside it may contain whitespace.
#
# External tools used: formail, sdiff, less, mktemp, plus the GNU extensions
# already relied upon here (\s in grep/sed patterns, mv --backup).

if [ -z "${1:-}" ]; then
  echo "Need a directory."
  exit 1
fi

DIRECTORY=$1

# A path with whitespace would be split into bogus "file names" further down,
# and those could then be compared or even moved by mistake.
case "$DIRECTORY" in
  *[[:space:]]*)
    echo "Directory names containing whitespace are not supported: $DIRECTORY" >&2
    exit 1
    ;;
esac

# Private scratch directory instead of fixed names in /tmp; removed on every
# exit path, including the early "exit 1" below.
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/manual_dedup.XXXXXX") || exit 1
tmp1=$tmpdir/1
tmp2=$tmpdir/2
trap 'rm -rf -- "$tmpdir"' EXIT
trap 'exit 1' HUP INT TERM

final_report=""
next_num=1

# remove_from LIST WORD
# Print the space-separated LIST without the entries equal to WORD.
# WORD is matched literally and as a whole entry, never as a regex.
remove_from() {
  printf '%s\n' "$1" | tr ' ' '\n' | grep -v -x -F -e "$2" | tr '\n' ' '
}

# count_words LIST
# Print the number of whitespace-separated entries in LIST.
count_words() {
  printf '%s\n' "$1" | wc -w
}

# nth_word LIST N
# Print the N-th whitespace-separated entry of the single-line LIST.
nth_word() {
  printf '%s\n' "$1" | awk -v n="$2" '{ print $n; exit }'
}

# --- Header normalisers ----------------------------------------------------
# Each takes a mail file and prints its sorted header lines (everything up to
# the first blank line) after removing what that comparison stage ignores.

# Headers as they are.
hdr_sorted() {
  sed -n '1,/^\s*$/p' <"$1" | sort
}

# Headers without the mbox 'From ' line.
hdr_no_from() {
  sed -n '1,/^\s*$/p' <"$1" |
    grep -v '^From ' |
    sort
}

# Headers without 'From ' and the X-From-Space-* lines.
hdr_no_from_space() {
  sed -n '1,/^\s*$/p' <"$1" |
    grep -v '^From ' |
    grep -v '^X-From-Space-Address: ' |
    grep -v '^X-From-Space-Date: ' |
    sort
}

# Handle yahoogroups vs. chain local: strip the listed header fields, cut
# From: after the opening '<', and remove "Re:" and "[lojban]" markers.
hdr_chain_yahoo() {
  formail -f \
    -I 'Content-Disposition:' -I 'Lines:' \
    -I 'X-Yahoo-Group-Post:' -I 'X-Yahoo-Profile:' -I 'X-List:' \
    -I 'X-Yahoo-Message-Num:' -I 'Received:' -I 'Reply-to:' \
    -I 'Content-Type:' -I 'Content-Length:' -I 'Delivered-To:' \
    -I 'Errors-to:' -I 'Sender:' -I 'Mailing-List:' \
    -I 'Mime-Version:' -I 'Return-Receipt-To:' \
    -I 'Content-Transfer-Encoding:' -I 'Precedence:' \
    -I 'X-archive-position:' -I 'X-ecartis-version:' \
    -I 'X-eGroups-Return:' -I 'X-original-sender:' \
    -I 'X-eGroups-From:' -I 'X-MIME-Autoconverted:' \
    -I 'X-Yahoo-Newman-Property:' -I 'DomainKey-Signature:' \
    -I 'X-Spam-Score:' -I 'X-eGroups-Msg-Info:' \
    -I 'X-Originating-IP:' -I 'Comment:' \
    -I 'X-Sender:' -I 'Return-Path:' -I 'X-Apparently-To:' <"$1" |
    sed 's/^\(From: [^<]*<\).*/\1/' |
    sed 's/\s\s*[Rr][Ee]:\s\s*//g' |
    sed 's/[[]lojban[]]//' |
    sed -n '1,/^\s*$/p' |
    grep -v '^From ' |
    sort
}

# Handle certain onelist vs. yahoo situations.
hdr_onelist_yahoo() {
  formail -f -I 'X-From-Space-Address:' \
    -I 'X-From-Space-Date:' -I 'Received:' \
    -I 'X-Yahoo-Message-Num:' -I 'Content-Length:' \
    -I 'Lines:' -I 'MIME-Version:' -I 'Precedence:' \
    -I 'Mailing-List:' -I 'List-Unsubscribe:' -I 'X-eGroups-Return:' \
    -I 'Content-Type:' -I 'Content-Transfer-Encoding:' \
    -I 'Content-Disposition:' \
    -I 'Delivered-To:' -I 'Return-Path:' -I 'Reply-To:' <"$1" |
    sed 's/[[]lojban[]]//' |
    sed -n '1,/^\s*$/p' |
    grep -v '^From ' |
    sort
}

# headers_match NORMALISER
# Run NORMALISER over $file1 and $file2, show the two results side by side,
# and succeed when diff (ignoring whitespace) prints no differences.
headers_match() {
  "$1" "$file1" >"$tmp1"
  "$1" "$file2" >"$tmp2"
  sdiff -W "$tmp1" "$tmp2"
  echo
  # Are there any differences?
  [ -z "$(diff -w "$tmp1" "$tmp2")" ]
}

# drop_first_with PATTERN
# Drop from $mid_mails the first of $file1/$file2 containing a line that
# matches PATTERN.  If neither does, nothing could be removed and the caller's
# loop would never terminate, so give up instead.
drop_first_with() {
  removals=$(grep -l -e "$1" "$file1" "$file2" | head -1)
  if [ -z "$removals" ]; then
    echo "Whoops, didn't pick anything to drop. Exiting."
    exit 1
  fi
  mid_mails=$(remove_from "$mid_mails" "$removals")
  echo "mm after: $mid_mails"
  echo
}

# Every distinct Message-Id, with regex metacharacters, quotes and blanks
# replaced by '.' so each id can be embedded in a grep pattern below.
mids=$(find "$DIRECTORY" -name rejected -prune -o -name .mh_sequences -prune \
  -o -type f -exec grep -i '^Message-Id: ' {} + |
  sed -e 's/^.*[Mm]essage-[Ii][Dd]:\s*//' |
  tr '^[]$()|*+?{}" \t' '...............' |
  sort | uniq)

# Deliberately a word-splitting "for" rather than a "while read" loop: stdin
# must stay attached to the user for the questions asked further down.
for mid in $mids; do
  mid_mails=$(/bin/grep -l "^[Mm]essage-[Ii][Dd]:\s*$mid$" "$DIRECTORY"/* | tr '\012' ' ')

  if [ "$(count_words "$mid_mails")" -lt 1 ]; then
    echo "Script bug; the MID $mid wasn't found anywhere."
    continue
  fi
  if [ "$(count_words "$mid_mails")" -le 1 ]; then
    # Only one file carries this id: nothing to deduplicate.
    continue
  fi

  # Take the first two, compare them, ask questions, remove one.
  # Repeat until only one left.  Move the others.
  orig_mid_mails=$mid_mails
  while [ "$(count_words "$mid_mails")" -gt 1 ]; do
    echo "mm: $mid_mails"
    file1=$(nth_word "$mid_mails" 1)
    file2=$(nth_word "$mid_mails" 2)

    if headers_match hdr_sorted; then
      echo
      echo "No diffs found between $file1 and $file2"
      # Remove the second arbitrarily
      mid_mails=$(remove_from "$mid_mails" "$file2")
      echo "mm after: $mid_mails"
      echo
      continue
    fi

    if headers_match hdr_no_from; then
      echo
      echo "No diffs found between $file1 and $file2 after 'From ' removal."
      # Remove the first one with 'From '
      drop_first_with '^From '
      continue
    fi

    if headers_match hdr_no_from_space; then
      echo
      echo "No diffs found between $file1 and $file2 after 'From ', X-From-Space-Address:, and X-From-Space-Date: removal."
      # Remove the first one with 'X-From-Space-'
      drop_first_with '^X-From-Space-'
      continue
    fi

    if headers_match hdr_chain_yahoo; then
      echo
      echo "No diffs found between $file1 and $file2 after stripping chain/yahoo differences."
      echo
      echo
      echo "*************************************** CHAIN/YAHOO BODY DIFF, just in case ****************************************"
      sed -n '/^\s*$/,/^\s*-------------/p' -- "$file1" | grep -v '^X-List: ' >"$tmp1"
      sed -n '/^\s*$/,/^\s*-------------/p' -- "$file2" | grep -v '^X-List: ' >"$tmp2"
      sdiff -sW "$tmp1" "$tmp2"
      echo "*************************************** END CHAIN/YAHOO BODY DIFF ***************************************************"
      echo
      echo
      # Remove the first one that looks like it's yahoo
      drop_first_with '^X-Yahoo-Message-Num:'
      continue
    fi

    if headers_match hdr_onelist_yahoo; then
      echo
      echo "No diffs found between $file1 and $file2 after stripping onelist/yahoo differences."
      echo
      echo
      echo "*************************************** ONELIST/YAHOO BODY DIFF, just in case ****************************************"
      sed -n '/^\s*$/,/^-------------/p' -- "$file1" >"$tmp1"
      sed -n '/^\s*$/,/^-------------/p' -- "$file2" >"$tmp2"
      sdiff -sW "$tmp1" "$tmp2"
      echo "*************************************** END ONELIST/YAHOO BODY DIFF ***************************************************"
      echo
      echo
      # Remove the first one with '^Mailing-List:'
      drop_first_with '^Mailing-List:'
      continue
    fi

    # There are still diffs; ask the user.
    answer=""
    while :; do
      echo "OK, given the above, would you like to keep the (l)eft mail, (r)ight mail, see a (f)ull sdiff, or (s)kip?"
      if ! read -r answer; then
        # stdin is exhausted, so nobody can answer: skip this id rather than
        # repeating the question forever.
        answer=s
      fi
      case "$answer" in
        l)
          # keep the left/first, so drop the right/second
          echo "mm before: $mid_mails"
          mid_mails=$(remove_from "$mid_mails" "$file2")
          echo "mm after: $mid_mails"
          echo
          break
          ;;
        r)
          # keep the right/second, so drop the left/first
          echo "mm before: $mid_mails"
          mid_mails=$(remove_from "$mid_mails" "$file1")
          echo "mm after: $mid_mails"
          echo
          break
          ;;
        f)
          sdiff -W "$file1" "$file2" | less
          ;;
        s)
          # An empty list ends the pairwise loop and moves nothing.
          mid_mails=''
          break
          ;;
      esac
    done
  done

  # We should have only one left, but it might have been skipped,
  # so check
  if [ "$(count_words "$mid_mails")" -eq 1 ]; then
    mkdir -p "$DIRECTORY/rejected"
    keeper=$(nth_word "$mid_mails" 1)
    get_rid_of=$(remove_from "$orig_mid_mails" "$keeper")
    echo "GRO: $orig_mid_mails -- $mid_mails -- $get_rid_of"
    for file in $get_rid_of; do
      # Lowest unused number.  The counter never needs to restart at 1 because
      # this script only ever adds entries to the rejected directory.
      while [ -e "$DIRECTORY/rejected/$next_num" ]; do
        next_num=$((next_num + 1))
      done
      echo "Moving $file to $DIRECTORY/rejected/$next_num"
      if /bin/mv --backup=t -- "$file" "$DIRECTORY/rejected/$next_num"; then
        final_report="$final_report
Moving $file to $DIRECTORY/rejected/$next_num"
      else
        # Do not claim a move in the final report that did not happen.
        echo "Failed to move $file to $DIRECTORY/rejected/$next_num" >&2
      fi
    done
  fi
done

printf 'FINAL REPORT: %s \n' "$final_report"