String Conversion

I wrote an AppleScript that deletes pages from a PDF document. It uses a command-line utility to actually write the PDF file, but that utility will not accept a list of page numbers to delete. Instead, it has to be given the page numbers that are to be retained.

I want to streamline a section of the script and hoped someone might be able to help. The user input is a string of the pages to delete as in the following:

“2 4-6 10”

I need to convert this for the PDF utility to a string as follows:

“1,3,7-9”

I’ve included below what I currently use.

Thanks.


--Variables from earlier in the script.
set userInput to "2 4-6 10"
set totalPages to 10

--Remember the delimiters in force so they can be put back once the conversion is finished.
set savedDelimiters to AppleScript's text item delimiters

--Convert user input to a list.
set AppleScript's text item delimiters to " "
set userInput to text items of userInput

--Set variable to individual pages to delete (held as text, e.g. {"2", "4", "5", "6", "10"}).
set deletePages to {}
set AppleScript's text item delimiters to "-"
repeat with i from 1 to (count userInput)
	set aPageReference to item i of userInput
	if aPageReference does not contain "-" then
		set the end of deletePages to aPageReference
	else
		set aPageReference to the text items of aPageReference
		set firstPage to (item 1 of aPageReference) as integer
		set lastPage to (item 2 of aPageReference) as integer
		--A range typed backwards ("6-4") would otherwise give a loop that never runs, silently keeping those pages.
		if lastPage < firstPage then set {firstPage, lastPage} to {lastPage, firstPage}
		repeat with j from firstPage to lastPage
			set the end of deletePages to (j as text)
		end repeat
	end if
end repeat
set AppleScript's text item delimiters to ""

--Set variables to individual pages to keep.
set keepPages to {} # as text
set keepPageNumbers to {} # as integers
repeat with i from 1 to totalPages
	if i as text is not in deletePages then
		set the end of keepPages to i as text
		set the end of keepPageNumbers to i
	end if
end repeat

--Convert individual pages to keep to page range for PDF utility.
--Each kept page except the last is compared with its successor: a page that ends a run gets a trailing ",",
--a page that starts a run gets a trailing "-", and pages in the middle of a run are skipped.
set pdfPages to {}
repeat with i from 1 to (count keepPageNumbers) - 1
	if item i of keepPageNumbers ≠ (item (i + 1) of keepPageNumbers) - 1 then
		set the end of pdfPages to item i of keepPages & ","
	else if i = 1 or text -1 of item -1 of pdfPages ≠ "-" then
		set the end of pdfPages to item i of keepPages & "-"
	end if
end repeat
if keepPages is {} then
	--Every page was marked for deletion: there is no last page to append, so return an empty string
	--instead of failing on "item -1" of an empty list. The caller should check for this before running the utility.
	set pdfPages to ""
else
	set the end of pdfPages to item -1 of keepPages
	--The fragments already carry their own separators, so they must be joined with an empty delimiter.
	set AppleScript's text item delimiters to ""
	set pdfPages to pdfPages as text
end if
set AppleScript's text item delimiters to savedDelimiters

get pdfPages # correctly returns "1,3,7-9"

Not a lot shorter, but this combines the last two repeats:


--Variables from earlier in the script.
set userInput to "2 4-6 10"
set totalPages to 10

--Convert user input to a list. Both spaces and commas are accepted as separators;
--doubled separators simply produce empty items, which are skipped below.
set astid to AppleScript's text item delimiters
set AppleScript's text item delimiters to {space, ","}
set userInput to text items of userInput

--Set variable to individual pages to delete (as integers).
set deletePages to {}
set AppleScript's text item delimiters to "-"
repeat with aPageReference in userInput
	if (aPageReference > "") then
		-- A single page "n" yields {"n"}, so its beginning and end are the same item.
		set aPageRange to text items of aPageReference
		set firstPage to (beginning of aPageRange) as integer
		set lastPage to (end of aPageRange) as integer
		-- A range typed backwards ("6-4") would otherwise give a loop that never runs, silently keeping those pages.
		if (lastPage < firstPage) then set {firstPage, lastPage} to {lastPage, firstPage}
		repeat with j from firstPage to lastPage
			set the end of deletePages to j
		end repeat
	end if
end repeat

-- Collect the "ranges" of pages to keep.
-- rangeStart is 0 while no run of kept pages is open; otherwise it holds the first page of the current run
-- and rangeEnd the most recent kept page. A run is closed when a deleted page or the last page is reached.
set keepPages to {} # as text
set rangeStart to 0
repeat with i from 1 to totalPages
	set deleting to (i is in deletePages)
	if (not deleting) then
		set rangeEnd to i
		if (rangeStart is 0) then set rangeStart to rangeEnd
	end if
	if ((rangeStart > 0) and ((deleting) or (i = totalPages))) then
		if (rangeStart = rangeEnd) then
			set end of keepPages to rangeStart
		else
			set end of keepPages to (rangeStart as text) & "-" & rangeEnd
		end if
		set rangeStart to 0
	end if
end repeat
-- Join with comma delimiters. An empty keepPages list (everything deleted) gives "".
set AppleScript's text item delimiters to ","
set pdfPages to keepPages as text
set AppleScript's text item delimiters to astid

get pdfPages # correctly returns "1,3,7-9"

Edit: Just thought of this too:

use AppleScript version "2.4" -- Yosemite (10.10) or later
use framework "Foundation"
use scripting additions

--Variables from earlier in the script.
set userInput to "2 4-6 10"
set totalPages to 10

--Convert user input to a list.
set astid to AppleScript's text item delimiters
set AppleScript's text item delimiters to {space, ","}
set userInput to text items of userInput

-- Get a mutable index set with indices matching all the page numbers in the document.
set pageNumbers to current application's class "NSMutableIndexSet"'s indexSetWithIndexesInRange:({1, totalPages})

-- Successively delete the specified page ranges from the set.
set AppleScript's text item delimiters to "-"
repeat with aPageReference in userInput
	if (aPageReference > "") then
		-- A single page "n" yields {"n"}, so its beginning and end are the same item.
		set aPageRange to text items of aPageReference
		set firstPage to (beginning of aPageRange) as integer
		set lastPage to (end of aPageRange) as integer
		-- A range typed backwards ("6-4") would produce a negative range length, so put it the right way round first.
		if (lastPage < firstPage) then set {firstPage, lastPage} to {lastPage, firstPage}
		tell pageNumbers to removeIndexesInRange:({firstPage, lastPage - firstPage + 1})
	end if
end repeat
set AppleScript's text item delimiters to astid

if (pageNumbers's |count|()) is 0 then
	-- Nothing is left to keep. The description of an empty set has no index list to extract
	-- (NOTE(review): it is believed to read "(no indexes)", which the parsing below would turn into "no,indexes").
	set pdfPages to ""
else
	-- Parse and massage the index set's description to get the remaining ranges in the required text format.
	-- The regex captures the contents of each parenthesised group; the last group is taken to be the
	-- space-separated index list (e.g. "1 3 7-9"), whose spaces are then replaced with commas.
	-- NOTE(review): this depends on the exact wording of -description, which is not a documented format.
	set desc to pageNumbers's |description|()
	set parensRegex to current application's class "NSRegularExpression"'s regularExpressionWithPattern:("(?<=\\()[^)]++(?=\\))") options:(0) |error|:(missing value)
	set parensMatches to parensRegex's matchesInString:(desc) options:(0) range:({0, desc's |length|()})
	set relevantRange to parensMatches's lastObject()'s range()
	set pdfPages to ((desc's substringWithRange:(relevantRange))'s stringByReplacingOccurrencesOfString:(space) withString:(",")) as text
end if

Nigel,

Thanks for the response. I inserted your first suggestion in my script and it works great.

The code after the comment that begins with “Collect the ranges” is going to take some study, as will your second suggestion. I like to learn new ways to handle stuff, so I’ll enjoy that.

Thanks again.

Peavine

This is a bit shorter, but takes a different approach. Its advantage, if any, is that it’s more forgiving of the input: it will accept commas, and ignore errant spaces.

use AppleScript version "2.4"
use framework "Foundation"
use scripting additions

set userInput to "2 4-6 10"
set totalPages to 10
set pdfPages to keepRangesForDeletions(userInput, totalPages)

-- Convert a list of pages to delete into the complementary list of pages to keep.
--   userInput  : text such as "2 4-6 10"; numbers may be separated by spaces, tabs or commas,
--                and "a-b" denotes a range. Entries are expected in ascending order.
--   totalPages : number of pages in the document.
-- Returns comma-separated text of the pages/ranges to keep, e.g. "1,3,7-9" ("" if nothing is kept).
on keepRangesForDeletions(userInput, totalPages)
	set pagesToKeep to current application's NSMutableArray's array()
	
	set theScanner to current application's NSScanner's scannerWithString:userInput
	theScanner's setCharactersToBeSkipped:(current application's NSCharacterSet's characterSetWithCharactersInString:("," & space & tab)) -- in case commas or tabs have been used
	
	-- lastValue is the highest page known to be deleted so far; 0 means "nothing before page 1".
	set lastValue to 0
	repeat
		-- A hyphen before the number means it is the end of a range begun by the previous number.
		set isRange to theScanner's scanString:"-" intoString:(missing value)
		set {theResult, theValue} to theScanner's scanInteger:(reference)
		if not theResult then exit repeat
		-- Pages strictly between the previous deleted page and this one are kept. Inside a range there is
		-- no such gap, and for adjacent deletions ("2 3") the gap is empty, so nothing must be added.
		if not isRange then my addKeptPages(pagesToKeep, lastValue + 1, theValue - 1)
		set lastValue to theValue
	end repeat
	-- Everything after the last deleted page is kept too.
	my addKeptPages(pagesToKeep, lastValue + 1, totalPages)
	
	return (pagesToKeep's componentsJoinedByString:",") as text
end keepRangesForDeletions

-- Append pages firstPage thru lastPage to the array: as "n" for a single page, as "a-b" for a longer run,
-- and not at all when the span is empty (lastPage < firstPage).
on addKeptPages(pagesToKeep, firstPage, lastPage)
	if firstPage = lastPage then
		pagesToKeep's addObject:(firstPage as text)
	else if firstPage < lastPage then
		pagesToKeep's addObject:((firstPage as text) & "-" & lastPage)
	end if
end addKeptPages

On the other hand, if it were me I’d do the whole page deletion job in the script:

use AppleScript version "2.4"
use framework "Foundation"
use framework "Quartz"
use scripting additions

-- Deletes the pages listed in userInput from a PDF chosen by the user and saves the result
-- next to the original as "<name>_copy.pdf". The original file is not modified.

set pdfPath to POSIX path of (choose file)
set userInput to "2 4-6 10"
set pagesToDelete to {}
-- build list of pages to delete
set theScanner to current application's NSScanner's scannerWithString:userInput
theScanner's setCharactersToBeSkipped:(current application's NSCharacterSet's characterSetWithCharactersInString:("," & space & tab)) -- in case commas or tabs have been used
set {theResult, theValue} to theScanner's scanInteger:(reference)
if theResult then set end of pagesToDelete to theValue
repeat
	-- A hyphen before the number means it is the end of a range begun by the previous number.
	set isRange to theScanner's scanString:"-" intoString:(missing value)
	set {theResult, theValue} to theScanner's scanInteger:(reference)
	if not theResult then exit repeat
	if isRange then
		repeat with i from (item -1 of pagesToDelete) + 1 to theValue
			set end of pagesToDelete to i
		end repeat
	else
		set end of pagesToDelete to theValue
	end if
end repeat
if pagesToDelete is {} then
	error "No pages to delete were specified"
end if
-- Remove duplicates and sort ascending: the deletion loop below works backwards through the list and
-- would remove the wrong pages if a number were repeated or the entries were out of order.
set pagesToDelete to ((current application's NSSet's setWithArray:pagesToDelete)'s allObjects()'s sortedArrayUsingSelector:"compare:") as list
-- open PDF
set pdfURL to current application's |NSURL|'s fileURLWithPath:pdfPath
set theDoc to current application's PDFDocument's alloc()'s initWithURL:pdfURL
if theDoc is missing value then
	error "Could not open the file as a PDF document"
end if
-- check enough pages (the list is sorted, so only its two ends need checking)
set pageCount to theDoc's pageCount()
if (item 1 of pagesToDelete < 1) or (item -1 of pagesToDelete > pageCount) then
	error "No such page in this document"
end if
-- delete pages, highest first so that the indexes of the pages still to be removed do not shift
repeat with pageNum in reverse of pagesToDelete
	(theDoc's removePageAtIndex:(pageNum - 1)) -- zero-based indexes
end repeat
-- save in new file
set newPath to pdfURL's |path|()'s stringByDeletingPathExtension()'s stringByAppendingString:("_copy.pdf")
if not (theDoc's writeToFile:newPath) then
	error "Could not write " & (newPath as text)
end if

Then you’ll have some fun with Shane’s scripts and mine. :wink:

The vanilla methods for getting the list of pages to delete appear to be slightly faster (0.0 seconds on my iMac) than the NSScanner one (0.0001 seconds). But Shane’s code for saving an edited copy of the PDF ( :cool: ) saves faffing around with the shell script format and the time taken by ‘do shell script’ itself.

Although both do the job as outlined, they could both arguably be a bit less literal. For example, suppose the input is “2 5-7 10” – they return “1,3-4,8-9”. Arguably “1,3,4,8,9” would be more sensible. Just another if case.

That’s actually in the derivation of the “keep” string from the delete list. But it makes a nice exercise for anyone who’d prefer that style. :slight_smile:

Shane,

Thanks for the suggestions. I’ll try them out in my script.

BTW, when using an early version of my script, I intended to delete 3 pages but an errant space before a dash caused the script to delete 30 pages. I’ve corrected for that now but errant spaces are a concern.

Peavine

Shane,

You raise a point which I had wondered about. I use this script often and I will occasionally delete a few pages from a PDF that contains several hundred pages. Telling the PDF utility to keep pages 1-15 and 17-200 just seemed to make more sense than listing every single page to keep, although perhaps it makes no difference to the PDF utility.

So far I’ve tested my script, amended to include Nigel’s first suggestion, and it works well even with extremely large PDFs. So, I’m a happy man.

Peavine

The scanner approach lets you list any characters you want to skip. It’s also pretty easy to make it recognise alternatives to a hyphen, should that be a possibility.

Well you’re just building ranges that the utility has to unpack at the other end :). That said, I think ranges generally make sense. I was just making the point that a range of two consecutive numbers, like “3-4”, doesn’t seem to offer any advantage over “3,4”.

I was curious and ran a test with my current script, which incorporates Nigel’s first suggestion, and an older version of the script, which does not convert consecutive page numbers to ranges. I used a 672-page PDF (49 MB) and deleted just one page midpoint in the PDF. Both scripts seemed to take about the same time (perhaps 1 second).

I had thought feeding the PDF utility 671 individual page numbers in a “do shell script” command would slow it down significantly or perhaps cause it not to function. That wasn’t the case.

Thanks again Nigel and Shane for the help.

This post presented a fun challenge to find a bash solution (and accomplish some self-teaching in the process). The following shell script accepts spaces, tabs, and commas as number separators, and handles redundant and out-of-order entries. It takes a couple of hundredths of a second to execute and thus is not as efficient as Nigel’s or Shane’s solution:


-- Builds the "pages to keep" string with a bash program run through 'do shell script'.
-- Uses two variables that are not set in this statement: userInput (text listing the pages to delete)
-- and totalPages (the page count).
--
-- Step 1 (the "input=" line): egrep -o pulls out every number or "number - number" pair (blanks and commas
-- are allowed around the hyphen); seq expands each one into individual numbers (for a lone number both
-- parameter expansions give the same value); sort -n | uniq orders them and drops repeats; tr turns the
-- newlines into commas. With the leading comma added, the result looks like ",2,4,5,6,10," so that any
-- page n to delete can be found as ",n,".
--
-- Step 2 (the while loop): walks n over seq 1..totalPages, skipping pages found in $input.
--   prev : the last page kept so far.
--   n2   : the end of a run of consecutive kept pages that is still open (0 = no open run).
-- A kept page that follows prev directly just extends the run; otherwise any open run is closed by
-- appending "-n2" before the new page is appended with a comma separator.
--
-- Step 3: after the loop, a run that is still open is closed, and the string is echoed as the result.
do shell script "" & ¬
	"input=\",$(egrep -o '[0-9]+([[:blank:],]*[-][[:blank:],]*[0-9]+)?' <<<" & userInput's quoted form & " | while read s; do seq ${s%%-*} ${s#*-}; done | sort -n | uniq | tr '\\n' ,)\";" & ¬
	"prev=-1;" & ¬
	"n2=0;" & ¬
	"while read n; do" & ¬
	"	if [[ $input != *,${n},* ]]; then" & ¬
	"		if (( n == prev+1 )); then" & ¬
	"			n2=$n;" & ¬
	"		elif (( n2 > 0 )); then" & ¬
	"			output=\"$output-$n2,$n\";" & ¬
	"			n2=0;" & ¬
	"		else" & ¬
	"			[[ -n $output ]] && output=\"$output,\";" & ¬
	"			output=\"$output$n\";" & ¬
	"		fi;" & ¬
	"		prev=$n;" & ¬
	"	fi;" & ¬
	"done <<<\"$(seq " & totalPages & ")\";" & ¬
	"(( n2 > 0 )) && output=\"$output-$n2\";" & ¬
	"echo \"$output\""

-- For userInput = "2 4-6 10" and totalPages = 10 --> "1,3,7-9"
-- For userInput = "2, 6-4, 10, 5-4, 2" and totalPages = 10 --> "1,3,7-9"

If one wanted to have mischievous fun, one could even (but probably shouldn’t) condense this into a single line:


-- Single-line form of the bash "pages to keep" conversion. Uses userInput (text listing the pages to delete)
-- and totalPages (the page count), neither of which is set in this statement.
-- $input becomes the expanded, sorted, de-duplicated delete list wrapped in commas (e.g. ",2,4,5,6,10,");
-- the while loop then walks 1..totalPages, skipping pages found in $input, with prev holding the last kept
-- page and n2 the end of any open run of consecutive kept pages (0 = none); the result is echoed at the end.
do shell script "input=\",$(egrep -o '[0-9]+([[:blank:],]*[-][[:blank:],]*[0-9]+)?' <<<" & userInput's quoted form & " | while read s; do seq ${s%%-*} ${s#*-}; done | sort -n | uniq | tr '\\n' ,)\"; prev=-1; n2=0; while read n; do if [[ $input != *,${n},* ]]; then if (( n == prev+1 )); then n2=$n; elif (( n2 > 0 )); then output=\"$output-$n2,$n\"; n2=0; else [[ -n $output ]] && output=\"$output,\"; output=\"$output$n\"; fi; prev=$n; fi; done <<<\"$(seq " & totalPages & ")\"; (( n2 > 0 )) && output=\"$output-$n2\"; echo \"$output\""

Bmose, I tried your suggestion and it worked great. It was the only one that worked with reverse page-range input (6-4 rather than 4-6). Good work. :slight_smile:

Thanks, peavine.

I realized that the script could be made a bit more condensed and efficient by using the “+=” operator in several places to append to the output variable:


-- Builds the "pages to keep" string with a bash program run through 'do shell script', appending to the
-- output variable with "+=". Uses two variables that are not set in this statement: userInput (text listing
-- the pages to delete) and totalPages (the page count).
--
-- Step 1 (the "input=" line): egrep -o pulls out every number or "number - number" pair (blanks and commas
-- are allowed around the hyphen); seq expands each one into individual numbers; sort -n | uniq orders them
-- and drops repeats; tr turns the newlines into commas. With the leading comma added, the result looks like
-- ",2,4,5,6,10," so that any page n to delete can be found as ",n,".
--
-- Step 2 (the while loop): walks n over seq 1..totalPages, skipping pages found in $input.
--   prev : the last page kept so far.
--   n2   : the end of a run of consecutive kept pages that is still open (0 = no open run).
-- A kept page that follows prev directly just extends the run; otherwise any open run is closed by
-- appending "-n2" before the new page is appended with a comma separator.
--
-- Step 3: after the loop, a run that is still open is closed, and the string is echoed as the result.
do shell script "" & ¬
	"input=\",$(egrep -o '[0-9]+([[:blank:],]*[-][[:blank:],]*[0-9]+)?' <<<" & userInput's quoted form & " | while read s; do seq ${s%%-*} ${s#*-}; done | sort -n | uniq | tr '\\n' ,)\";" & ¬
	"prev=-1;" & ¬
	"n2=0;" & ¬
	"while read n; do" & ¬
	"	if [[ $input != *,${n},* ]]; then" & ¬
	"		if (( n == prev+1 )); then" & ¬
	"			n2=$n;" & ¬
	"		elif (( n2 > 0 )); then" & ¬
	"			output+=\"-$n2,$n\";" & ¬
	"			n2=0;" & ¬
	"		else" & ¬
	"			[[ -n $output ]] && output+=\",\";" & ¬
	"			output+=\"$n\";" & ¬
	"		fi;" & ¬
	"		prev=$n;" & ¬
	"	fi;" & ¬
	"done <<<\"$(seq " & totalPages & ")\";" & ¬
	"(( n2 > 0 )) && output+=\"-$n2\";" & ¬
	"echo \"$output\""