Search for Text in PDF and extract pages

hi there

I wonder if there’s a scriptable way to search a PDF for some text and, wherever the text is found, export the matching page(s) as single PDFs.

Is this even possible?

thanks in advance for your help
kind regards

Marth

Here is a modified version of one of Shane STANLEY’s scripts, which should fit your needs.


use AppleScript version "2.3.1"
use scripting additions
use framework "Foundation"
use framework "Quartz" -- required for PDF stuff

--property theKey : ""

#===== Handlers

-- Creates a new PDF file for every page of the passed PDF file that contains the key string.

on splitPDF:thePath forKey:theKey
	-- Exports every page of the PDF at thePath whose text contains theKey as its own
	-- single-page PDF. Each output path is built by addString:beforeExtensionIn: from
	-- thePath plus "-page " and the zero-padded, 1-based page number.
	--   thePath : POSIX path of the source PDF
	--   theKey  : text to look for (tested with AppleScript's `contains`)
	-- Raises an error when the source cannot be opened or an output file cannot be written.
	
	set inNSURL to current application's |NSURL|'s fileURLWithPath:thePath
	
	set thePDFDocument to current application's PDFDocument's alloc()'s initWithURL:inNSURL
	-- A nil document (unreadable file, not a PDF) would otherwise fail later with an obscure message
	if thePDFDocument is missing value then error "Unable to open " & thePath & " as a PDF document."
	
	# CAUTION. Output names use page numbers starting from 1, but pageAtIndex: numbers pages starting from 0
	set theCount to thePDFDocument's pageCount() as integer
	
	-- Pad page numbers to the width of the highest page number (at least 2 digits).
	-- A fixed 2-digit suffix would give page 101 the same file name as page 1.
	set padWidth to count of (theCount as text)
	if padWidth < 2 then set padWidth to 2
	
	repeat with i from 1 to theCount
		set thePDFPage to (thePDFDocument's pageAtIndex:(i - 1))
		set pageNSString to thePDFPage's |string|()
		-- NOTE(review): the string property is nullable; treat a page without text as empty
		if pageNSString is missing value then
			set itsText to ""
		else
			set itsText to pageNSString as text
		end if
		if itsText contains theKey then
			
			set pageNumber to i as text
			repeat while (count of pageNumber) < padWidth
				set pageNumber to "0" & pageNumber
			end repeat
			
			set newPath to (its addString:("-page " & pageNumber) beforeExtensionIn:thePath)
			set outNSURL to (current application's |NSURL|'s fileURLWithPath:newPath)
			set newPDFDoc to current application's PDFDocument's alloc()'s init()
			(newPDFDoc's insertPage:thePDFPage atIndex:0)
			-- writeToURL: reports failure through its boolean result; do not ignore it
			set didWrite to (newPDFDoc's writeToURL:outNSURL) as boolean
			if not didWrite then error "Unable to write " & newPath
		end if
	end repeat
end splitPDF:forKey:

-- inserts a string in a path before the extension
on addString:extraString beforeExtensionIn:aPath
	-- Returns aPath (as text) with extraString inserted just before the path extension.
	--   extraString : text to insert
	--   aPath       : POSIX path; may or may not carry an extension
	set pathNSString to current application's NSString's stringWithString:aPath
	set baseNSString to pathNSString's stringByDeletingPathExtension()
	set extNSString to pathNSString's pathExtension()
	
	-- Without an extension the "%@%@.%@" format would leave a dangling "." at the end
	if ((extNSString's |length|()) as integer) = 0 then
		return (baseNSString's stringByAppendingString:extraString) as text
	end if
	
	set newNSString to current application's NSString's stringWithFormat_("%@%@.%@", baseNSString, extraString, extNSString)
	return newNSString as text
end addString:beforeExtensionIn:

#===== Caller

-- Ask for the text to search for (a default value is pre-filled in the dialog)
set keyReply to display dialog "Enter the key to search for:" default answer "Manang Saling"
set theKey to text returned of keyReply

-- Ask for the PDF to scan, then export every page containing the key
set chosenPDF to choose file with prompt "Choose a PDF file." of type {"PDF"}
set thePath to POSIX path of chosenPDF
its splitPDF:thePath forKey:theKey

You will be prompted to enter the key string to search for and to select the file to search in.

Yvan KOENIG running High Sierra 10.13.6 in French (VALLAURIS, France) lundi 4 novembre 2019 14:00:02

How about this?


-- Created 2017-06-18 by Takaaki Naganoya
-- 2017 Piyomaru Software
use AppleScript version "2.4"
use scripting additions
use framework "Foundation"
use framework "Quartz"
use bPlus : script "BridgePlus"

--Keywords to look for; list each spelling variant separately (considering case)
set sList to {"Piyomaru Software", "PIYOMARU Soft"}

--Let the user pick the PDF to scan
set pdfAlias to choose file of type {"com.adobe.pdf"}
set thePath to POSIX path of pdfAlias

--Collect the numbers of the pages on which any keyword was found
set aRes to my findWordListInPDFContents(thePath, sList)
--> {1, 3, 4, 71, 72, 75, 95, 96, 97, 98, 420, 429, 479, 483}--hit page numbers list


on findWordListInPDFContents(thePOSIXPath as string, sList as list)
	-- Returns a sorted list, without duplicates, of the 1-based page numbers of the PDF at
	-- thePOSIXPath on which at least one keyword from sList was found.
	--   thePOSIXPath : POSIX path of the PDF to scan
	--   sList        : list of keyword strings
	-- Raises an error when the file cannot be opened as a PDF.
	script spdPDF
		property textCache : missing value
		property aList : {}
	end script
	
	--Make Text Search Cache from a PDF
	set anNSURL to (current application's |NSURL|'s fileURLWithPath:thePOSIXPath)
	set theDoc to current application's PDFDocument's alloc()'s initWithURL:anNSURL
	if theDoc is missing value then error "Unable to open " & thePOSIXPath & " as a PDF document."
	set theCount to theDoc's pageCount() as integer
	
	set (textCache of spdPDF) to current application's NSMutableArray's new()
	
	repeat with i from 0 to (theCount - 1)
		set aPage to (theDoc's pageAtIndex:i)
		set tmpStr to (aPage's |string|())
		-- NOTE(review): the string property is nullable; cache an empty string for a page without text
		if tmpStr is missing value then set tmpStr to ""
		((textCache of spdPDF)'s addObject:{pageIndex:i + 1, pageString:tmpStr})
	end repeat
	
	
	--Search for text cache
	repeat with s in sList
		-- s is a reference to the list item; resolve it once to plain text
		set aKeyword to (contents of s) as string
		
		--(1) Partial match search, page by page.
		-- The keyword is passed as a predicate argument instead of being spliced into the
		-- format string, so a keyword containing a quote cannot break the predicate.
		set aPredicate to (current application's NSPredicate's predicateWithFormat:"pageString contains %@" argumentArray:{aKeyword})
		set hitRecs to ((textCache of spdPDF)'s filteredArrayUsingPredicate:aPredicate)
		set bRes to (hitRecs's valueForKey:"pageIndex") as list
		
		--(2) Nothing found by (1): the keyword may lie on multiple pages, so search the whole document
		if bRes = {} then
			set theSels to (theDoc's findString:aKeyword withOptions:0)
			repeat with aSel in theSels
				set firstPage to (aSel's pages()'s objectAtIndex:0)
				-- Use the page's position in the document (0-based, hence + 1), not its label:
				-- labels need not be numeric and need not match the numbering used in (1)
				set curPage to ((theDoc's indexForPage:firstPage) as integer) + 1
				if curPage is not in bRes then
					set the end of bRes to curPage
				end if
			end repeat
		end if
		
		set the end of (aList of spdPDF) to bRes
		
	end repeat
	
	--2D list to 1D list conversion (Flatten)
	load framework
	set bList to (current application's SMSForder's arrayByFlattening:(aList of spdPDF)) as list
	
	--Uniquefy
	set cList to uniquifyList(bList) of me
	
	--Sort 1D List
	set anArray to current application's NSArray's arrayWithArray:cList
	set sortRes1 to (anArray's sortedArrayUsingSelector:"compare:") as list
	
	
	set (textCache of spdPDF) to "" --Purge
	set (aList of spdPDF) to {} --Purge
	
	return sortRes1
end findWordListInPDFContents


on filterRecListByLabel1(aRecList as list, aPredicate as string)
	-- Returns the NSArray of the entries of aRecList that satisfy the
	-- NSPredicate format string aPredicate.
	set thePredicate to current application's NSPredicate's predicateWithFormat:aPredicate
	set sourceArray to current application's NSArray's arrayWithArray:aRecList
	return (sourceArray's filteredArrayUsingPredicate:thePredicate)
end filterRecListByLabel1


on uniquifyList(aList as list)
	-- Returns an AppleScript list holding the distinct items of aList
	-- (key-value coding collection operator @distinctUnionOfObjects).
	set sourceArray to current application's NSArray's arrayWithArray:aList
	return (sourceArray's valueForKeyPath:"@distinctUnionOfObjects.self") as list
end uniquifyList


Model: MacBook Pro 2012
AppleScript: 2.7
Browser: Safari 13.0.1
Operating System: macOS 10.14

Thank you very much guys!