Search for Text in PDF and extract pages

hi there

I wonder if there's a scriptable way to search a PDF for a given text and, whenever the text is found, export the matching page(s) as single PDFs.

Is this even possible?

thanks in advance for your help
Re: Search for Text in PDF and extract pages

Here is a modified version of one of Shane STANLEY's scripts, which should fit your needs.


use AppleScript version "2.3.1"
use scripting additions
use framework "Foundation"
use framework "Quartz" -- required for PDF stuff

--property theKey : ""

#===== Handlers

-- Creates a new PDF file for every page of the passed PDF file that contains the key string.

on splitPDF:thePath forKey:theKey
   -- Writes one single-page PDF for every page of the PDF at thePath whose text contains theKey.
   --   thePath : POSIX path of the source PDF.
   --   theKey  : text to look for (AppleScript "contains" comparison).
   -- Output files sit next to the source and are named "<name>-page NN.<ext>", where NN is the
   -- 1-based page number, zero-padded to at least two digits and widened to the digit count of
   -- the last page so that names never collide in documents of 100 pages or more.
   set inNSURL to current application's |NSURL|'s fileURLWithPath:thePath
   set thePDFDocument to current application's PDFDocument's alloc()'s initWithURL:inNSURL
   -- initWithURL: yields missing value when the file cannot be read as a PDF.
   if thePDFDocument is missing value then error "Could not open a PDF document at " & thePath
   set theCount to thePDFDocument's pageCount() as integer
   -- Padding width: two digits minimum (the historical naming), more when the document needs it.
   set padWidth to length of (theCount as text)
   if padWidth < 2 then set padWidth to 2
   # CAUTION. The loop counts pages starting from 1, but PDFKit numbers them starting from 0.
   repeat with i from 1 to theCount
       set thePDFPage to (thePDFDocument's pageAtIndex:(i - 1))
       set pageNSString to thePDFPage's |string|()
       -- Skip pages that report no text; coercing missing value to text would give the literal
       -- "missing value" and could match the key by accident.
       if pageNSString is not missing value then
           set itsText to pageNSString as text
           if itsText contains theKey then
               set pageLabel to i as text
               repeat while (length of pageLabel) < padWidth
                   set pageLabel to "0" & pageLabel
               end repeat
               set newPath to (my addString:("-page " & pageLabel) beforeExtensionIn:thePath)
               set outNSURL to (current application's |NSURL|'s fileURLWithPath:newPath)
               set newPDFDoc to current application's PDFDocument's alloc()'s init()
               (newPDFDoc's insertPage:thePDFPage atIndex:0)
               (newPDFDoc's writeToURL:outNSURL)
           end if
       end if
   end repeat
end splitPDF:forKey:

-- inserts a string in a path before the extension
on addString:extraString beforeExtensionIn:aPath
   -- Returns aPath with extraString inserted just before the file extension.
   --   extraString : text to insert.
   --   aPath       : POSIX path (text).
   -- Example: "-page 01" and "/tmp/doc.pdf" give "/tmp/doc-page 01.pdf".
   -- When aPath has no extension, extraString is simply appended (no dangling "." is added).
   set pathNSString to current application's NSString's stringWithString:aPath
   set theExtension to pathNSString's pathExtension()
   set newNSString to (pathNSString's stringByDeletingPathExtension())'s stringByAppendingString:extraString
   if (theExtension's |length|()) > 0 then
       set newNSString to (newNSString's stringByAppendingPathExtension:theExtension)
   end if
   return newNSString as text
end addString:beforeExtensionIn:

#===== Caller

-- Ask for the search key first, then for the PDF to scan, and finally run the split.
set keyDialogReply to display dialog "Enter the key to search for:" default answer "Manang Saling"
set theKey to text returned of keyDialogReply

set chosenPDF to choose file with prompt "Choose a PDF file." of type {"PDF"}
set thePath to POSIX path of chosenPDF
my splitPDF:thePath forKey:theKey

You will be prompted to enter the key string to search for and to select the file to search in.

Yvan KOENIG running High Sierra 10.13.6 in French (VALLAURIS, France) lundi 4 novembre 2019  14:00:02

Re: Search for Text in PDF and extract pages

How about this?


-- Created 2017-06-18 by Takaaki Naganoya
-- 2017 Piyomaru Software
use AppleScript version "2.4"
use scripting additions
use framework "Foundation"
use framework "Quartz"
use bPlus : script "BridgePlus"

-- Spelling variants of the keyword to look for (matching is case-sensitive)
set sList to {"Piyomaru Software", "PIYOMARU Soft"}

-- Let the user pick the PDF to scan
set pickedPDF to choose file of type {"com.adobe.pdf"}
set thePath to POSIX path of pickedPDF

-- The result is the sorted list of page numbers on which any variant was found,
-- e.g. {1, 3, 4, 71, 72, 75, 95, 96, 97, 98, 420, 429, 479, 483}
set aRes to my findWordListInPDFContents(thePath, sList)

on findWordListInPDFContents(thePOSIXPath as string, sList as list)
   -- Returns the sorted, de-duplicated list of 1-based page numbers of the PDF at thePOSIXPath
   -- on which at least one of the keywords in sList occurs (case-sensitive).
   --   thePOSIXPath : POSIX path of the PDF to scan.
   --   sList        : list of keyword strings.
   -- Raises an error when the file cannot be opened as a PDF document.
   script spdPDF
       -- Script-object properties are used as working storage for the page cache and the hits.
       property textCache : missing value
       property aList : {}
   end script
   
   --Make Text Search Cache from a PDF
   set anNSURL to (current application's |NSURL|'s fileURLWithPath:thePOSIXPath)
   set theDoc to current application's PDFDocument's alloc()'s initWithURL:anNSURL
   if theDoc is missing value then error "Could not open a PDF document at " & thePOSIXPath
   set theCount to theDoc's pageCount() as integer
   set (textCache of spdPDF) to current application's NSMutableArray's new()
   repeat with i from 0 to (theCount - 1)
       set aPage to (theDoc's pageAtIndex:i)
       set tmpStr to (aPage's |string|())
       -- A page may report no string at all; cache an empty string so the predicate below
       -- always compares against a real string.
       if tmpStr is missing value then set tmpStr to ""
       ((textCache of spdPDF)'s addObject:{pageIndex:i + 1, pageString:tmpStr})
   end repeat
   
   --Search the text cache
   repeat with s in sList
       set aKeyword to (contents of s) as text
       --(1) Partial match search, page by page.
       -- The keyword is passed as a predicate argument rather than spliced into the format
       -- string, so keywords containing quotes or backslashes cannot break predicate parsing.
       set aPred to (current application's NSPredicate's predicateWithFormat:"pageString contains %@" argumentArray:{aKeyword})
       set hitRecs to ((textCache of spdPDF)'s filteredArrayUsingPredicate:aPred)
       set bRes to (hitRecs's valueForKey:"pageIndex") as list
       --(2) Nothing found by (1): the keyword may lie across a page boundary, so ask PDFKit.
       if bRes = {} then
           set theSels to (theDoc's findString:aKeyword withOptions:0) as list
           repeat with aSel in theSels
               set firstPage to (aSel's pages()'s objectAtIndex:0)
               -- Use the physical page index (+1) instead of the page label: labels may be
               -- non-numeric or offset, and would not agree with the numbers produced by (1).
               set curPage to ((theDoc's indexForPage:firstPage) as integer) + 1
               if curPage is not in bRes then
                   set the end of bRes to curPage
               end if
           end repeat
       end if
       set the end of (aList of spdPDF) to bRes
   end repeat
   
   --2D list to 1D list conversion (Flatten), via BridgePlus
   load framework
   set bList to (current application's SMSForder's arrayByFlattening:(aList of spdPDF)) as list
   set cList to uniquifyList(bList) of me
   --Sort 1D List
   set anArray to current application's NSArray's arrayWithArray:cList
   set sortRes1 to (anArray's sortedArrayUsingSelector:"compare:") as list
   set (textCache of spdPDF) to "" --Purge
   set (aList of spdPDF) to {} --Purge
   return sortRes1
end findWordListInPDFContents

on filterRecListByLabel1(aRecList as list, aPredicate as string)
   -- Filters a list of records with an NSPredicate format string.
   --   aRecList   : list of records to filter.
   --   aPredicate : predicate in NSPredicate format-string syntax.
   -- Returns the matching records as an NSArray (not coerced to an AppleScript list).
   set compiledPredicate to current application's NSPredicate's predicateWithFormat:aPredicate
   set sourceArray to current application's NSArray's arrayWithArray:aRecList
   return (sourceArray's filteredArrayUsingPredicate:compiledPredicate)
end filterRecListByLabel1

on uniquifyList(aList as list)
   -- Returns aList with duplicate items removed, as an AppleScript list.
   -- Uses the "@distinctUnionOfObjects" key-value collection operator on an NSArray copy.
   set sourceArray to current application's NSArray's arrayWithArray:aList
   return (sourceArray's valueForKeyPath:"@distinctUnionOfObjects.self") as list
end uniquifyList

I have written thousands of AppleScripts to realize my ideas: a natural language interface, a voice recognition commander, and so on. Though my mother tongue is a strange language, Japanese, the language I write most frequently is AppleScript. I believe it exists to make things easy and powerful.

Re: Search for Text in PDF and extract pages

Thank you very much guys!



