I’ve been meaning to share this for a while.
This script reads the duplicate list created by the Araxis Find Duplicate Files app and lists the number of duplicates in each folder tuple. I use this to find folders that have nearly duplicate contents. It includes a sort handler (named heap_sort, although it is implemented as a merge sort) whose comparison rule can be customized by swapping in a different script object.
-- Script object whose value_to_test handler supplies the sort key used by
-- heap_sort and merge_sets. It is assigned just before sorting.
property value_test : missing value

-- Reads the Duplicates Report and counts the number of files for every pair or
-- n-tuple of parent folders.
--
-- Outline:
--   1. Read the report one line (one file path) at a time.
--   2. Skip housekeeping files (.DS_Store, Icon files, Bridge sidecars, clippings).
--   3. Tally how many duplicates there are per file extension.
--   4. Treat a change in file size as the start of a new duplicate group and
--      collect the sorted tuple of parent folders for each group.
--   5. Write the extension tallies and the folder tuples, grouped by how many
--      duplicate files each tuple shares (1, 2, more than 2), to a log file on
--      the desktop.
--
-- NOTE(review): two adjacent duplicate groups with identical file sizes are
-- indistinguishable to step 4 and will be merged -- confirm against the report format.
-- NOTE(review): the path conversion drops the first two "/"-delimited components,
-- which presumably assumes paths of the form "/Volumes/<disk>/..." -- confirm.
set tab_char to tab
set lf_char to ASCII character 10
set ignored_types to {"DS_Store", "BridgeSort", "BridgeLabelsAndRatings", "textClipping"}
set duplicates_file_path_name to (choose file with prompt "Pick the Araxis Duplicates List to use") as string
set log_file_path to (path to desktop as string) & "Folders with dups from Araxis.txt"

set duplicates_file_id to open for access file duplicates_file_path_name
try
	set file_id to open for access file log_file_path with write permission
on error error_message number error_number
	-- Do not leave the report open if the log file cannot be opened.
	close access duplicates_file_id
	error error_message number error_number
end try

try
	write "Start " & (current date) & return & return to file_id starting at eof
	set current_size to -1
	set result_set to {}
	set type_count_set to {}
	set name_tuple to missing value
	
	try
		repeat
			-- Pass the file reference number itself, as the read command documents.
			set temp_text to read duplicates_file_id until lf_char as «class utf8»
			-- Strip the line terminator only when it is present, so a final line
			-- without a trailing linefeed keeps its last character and a bare
			-- linefeed becomes an empty (skipped) line instead of an error.
			if temp_text ends with lf_char then
				if (count of characters in temp_text) > 1 then
					set temp_text to text 1 thru -2 of temp_text
				else
					set temp_text to ""
				end if
			end if
			if (count of characters in temp_text) > 0 then
				set file_path_name to temp_text
				set AppleScript's text item delimiters to "/"
				set file_name to text item -1 of file_path_name
				set AppleScript's text item delimiters to "."
				-- For a name without a "." this is the whole file name, which is
				-- how the "Icon" test below can match.
				set file_type to text item -1 of file_name
				set AppleScript's text item delimiters to ""
				
				set is_ignored to (file_type is in ignored_types) or (file_type begins with "Icon")
				if not is_ignored then
					-- Tally duplicates per file type.
					set existing_record to false
					repeat with curr_record in type_count_set
						if file_type of curr_record = file_type then
							set existing_record to true
							set type_count of curr_record to (type_count of curr_record) + 1
							exit repeat
						end if
					end repeat
					if not existing_record then
						set end of type_count_set to {file_type:file_type, type_count:1}
					end if
					
					-- Convert the "/"-delimited path into a ":"-delimited path for Finder.
					set AppleScript's text item delimiters to "/"
					set folder_path_name to text items 3 thru -2 of file_path_name
					set file_path_name to text items 3 thru -1 of file_path_name
					set AppleScript's text item delimiters to ":"
					set file_path_name to file_path_name as string
					set folder_path_name to folder_path_name as string
					set AppleScript's text item delimiters to ""
					
					tell application "Finder" to set file_size to size of file file_path_name
					if file_size ≠ current_size then
						-- A new size starts a new duplicate group: record the finished one.
						my increment_tuple_count(name_tuple, result_set)
						set name_tuple to missing value
						set current_size to file_size
					end if
					set name_tuple to my add_to_tuple(folder_path_name, name_tuple)
				end if
			end if
		end repeat
	on error number -39 -- eof
	end try
	
	-- Record the last duplicate group; the loop only records a group when the
	-- next one begins, so without this the final group would be dropped.
	my increment_tuple_count(name_tuple, result_set)
	close access duplicates_file_id
	
	repeat with curr_record in type_count_set
		write (((type_count of curr_record) as string) & tab_char & ((file_type of curr_record) as string) & return) to file_id starting at eof
	end repeat
	
	set list_count to count of items of result_set
	write (list_count as string) & " Folder sets found" & return & return to file_id starting at eof
	if list_count > 0 then
		set value_test to get_first_item_in_name_tuple
		set result_set to my heap_sort(result_set)
		-- Report folder sets sharing exactly one file, then exactly two, then more.
		my write_folder_sets(result_set, file_id, 1, 1)
		my write_folder_sets(result_set, file_id, 2, 2)
		my write_folder_sets(result_set, file_id, 3, missing value)
	else
		write "No dups found" & return to file_id starting at eof
	end if
	write "End " & (current date) & return to file_id starting at eof
	close access file_id
on error error_message number error_number
	-- Leave no file open and no delimiter changed behind, then report the
	-- original error. Closing an already closed file is harmlessly ignored.
	set AppleScript's text item delimiters to ""
	try
		close access duplicates_file_id
	end try
	try
		close access file_id
	end try
	error error_message number error_number
end try

say "finished"
return

-- Writes every folder set in result_set whose tuple_count is at least min_count
-- and at most max_count (no upper limit when max_count is missing value) to the
-- open file file_id. Each set is written as its count, then one line per folder
-- holding the folder's file count, size and path, followed by a blank line.
on write_folder_sets(result_set, file_id, min_count, max_count)
	repeat with curr_record in result_set
		set set_count to tuple_count of curr_record
		set in_range to (set_count ≥ min_count)
		if in_range and (max_count is not missing value) then set in_range to (set_count ≤ max_count)
		if in_range then
			write (set_count as string) & return to file_id starting at eof
			repeat with folder_ref in name_tuple of curr_record
				-- Dereference the loop reference so Finder receives a plain path string.
				set folder_name to contents of folder_ref
				tell application "Finder" to set folder_count to count of files in folder folder_name
				tell application "Finder" to set folder_size to size of folder folder_name
				write (folder_count as string) & tab & (folder_size as string) & tab & folder_name & return to file_id starting at eof
			end repeat
			write return to file_id starting at eof
		end if
	end repeat
end write_folder_sets
-- Inserts location_name into name_tuple, keeping the list in ascending order.
--   location_name : the value to insert.
--   name_tuple    : an already sorted list, or missing value to start a new one.
-- Returns the list containing the new entry. An existing list is modified in
-- place and returned; missing value yields a fresh one-item list.
on add_to_tuple(location_name, name_tuple)
	if name_tuple is missing value then return {location_name}
	
	-- Open a slot at the end, then slide larger entries up one position until
	-- the slot sits where the new value belongs.
	set slot to (count of items of name_tuple) + 1
	set end of name_tuple to location_name
	repeat while (slot > 1) and ((item (slot - 1) of name_tuple) > location_name)
		set item slot of name_tuple to item (slot - 1) of name_tuple
		set slot to slot - 1
	end repeat
	set item slot of name_tuple to location_name
	return name_tuple
end add_to_tuple
-- Records one more occurrence of name_tuple in result_set.
--   name_tuple : a list identifying a folder set, or missing value (ignored).
--   result_set : list of {name_tuple, tuple_count} records, updated in place.
-- If a record with an equal name_tuple exists its tuple_count is increased by
-- one; otherwise a new record with a count of 1 is appended.
on increment_tuple_count(name_tuple, result_set)
	if name_tuple is not missing value then
		-- Find the position of the first record holding this tuple (0 = none).
		set match_index to 0
		repeat with record_index from 1 to (count of result_set)
			if (name_tuple of item record_index of result_set) = name_tuple then
				set match_index to record_index
				exit repeat
			end if
		end repeat
		
		if match_index = 0 then
			set end of result_set to {name_tuple:name_tuple, tuple_count:1}
		else
			set matched_record to item match_index of result_set
			set tuple_count of matched_record to (tuple_count of matched_record) + 1
		end if
	end if
end increment_tuple_count
-- Sorts set_to_sort into ascending order of the key supplied by the value_test
-- script object's value_to_test handler, and returns the sorted list.
-- Lists longer than two items are split in half, each half is sorted
-- recursively, and the halves are combined with merge_sets. A two-item list is
-- swapped in place when needed; shorter lists are returned as they are.
-- The element count and the first name of every item in the result are logged.
on heap_sort(set_to_sort)
	set total_items to count of set_to_sort
	log total_items
	
	if total_items ≤ 2 then
		if total_items = 2 then
			set first_key to value_test's value_to_test(1, set_to_sort)
			set second_key to value_test's value_to_test(2, set_to_sort)
			if first_key > second_key then
				set held_item to item 1 of set_to_sort
				set item 1 of set_to_sort to item 2 of set_to_sort
				set item 2 of set_to_sort to held_item
			end if
		end if
		set sorted_items to set_to_sort
	else
		set split_at to (total_items / 2) as integer
		set lower_half to my heap_sort(items 1 thru split_at of set_to_sort)
		set upper_half to my heap_sort(items (split_at + 1) thru total_items of set_to_sort)
		set sorted_items to my merge_sets(lower_half, upper_half)
	end if
	
	-- Trace the result of this level of the sort.
	log " new list"
	repeat with sorted_item in sorted_items
		log " " & (item 1 of name_tuple of sorted_item) as string
	end repeat
	log " end merge" & return & " "
	return sorted_items
end heap_sort
-- Merges two lists that are each already sorted by the key supplied by the
-- value_test script object's value_to_test handler.
--   set_a, set_b : sorted lists; neither is modified.
-- Returns a new list holding every item of both inputs in ascending key order.
-- When keys are equal the item from set_a is taken first, so items that compare
-- equal keep their original relative order (a stable merge), matching the
-- two-item case in heap_sort, which also leaves equal items in place.
on merge_sets(set_a, set_b)
	set current_a to 1
	set current_b to 1
	set last_a to count of items of set_a
	set last_b to count of items of set_b
	set merged_set to {}
	
	-- Take the smaller head item until one list runs out.
	repeat while current_a ≤ last_a and current_b ≤ last_b
		set key_a to value_test's value_to_test(current_a, set_a)
		set key_b to value_test's value_to_test(current_b, set_b)
		if key_a ≤ key_b then
			set end of merged_set to item current_a of set_a
			set current_a to current_a + 1
		else
			set end of merged_set to item current_b of set_b
			set current_b to current_b + 1
		end if
	end repeat
	
	-- Append whatever remains; at most one of these loops does any work.
	repeat while current_a ≤ last_a
		set end of merged_set to item current_a of set_a
		set current_a to current_a + 1
	end repeat
	repeat while current_b ≤ last_b
		set end of merged_set to item current_b of set_b
		set current_b to current_b + 1
	end repeat
	return merged_set
end merge_sets
-- Comparison rule for the sort handlers: the sort key of a record is the first
-- entry of its name_tuple list. Assign a different script object with the same
-- value_to_test handler to value_test to sort by another rule.
script get_first_item_in_name_tuple
	-- Returns the sort key of the record at position curr_index in set_to_test.
	on value_to_test(curr_index, set_to_test)
		set chosen_record to item curr_index of set_to_test
		return item 1 of (name_tuple of chosen_record)
	end value_to_test
end script