Returns sentences or clauses from a paragraph, taking abbreviations and acronyms into account

Hello.

This is mostly for fun, but it can be useful now and then! :smiley: For example, if you want to compute a readability index, or wrap the start of every sentence in some kind of markup.

It should support most Western languages well, since I use Satimage.osax to detect uppercase characters; its uppercase handling is at least as good as that of most other uppercase functions.

This is not optimized for performance, and it won't be until I need it to be, which I don't see happening anytime soon. :slight_smile:

Caveats

It doesn't work well with uppercase figure designations such as "fig. A", because a period followed by a space and an uppercase letter is taken as the end of a sentence. The same holds for footnotes and endnotes, should you put an abbreviation in front of them.
The workaround is to either lowercase the designation or spell it out in full: "figure A".

Edit
Removed Norwegian comments, and unnecessary log statements.


-- This code is © McUsr 2012. You are not allowed to post it standalone elsewhere without permission, but you may use it as a part of your own work. Preferably, post that here as well!

-- Title of this script; the run handler passes it to checkforOsax, which uses it as the dialog title.
property scripttitle : "everySentence Driver"

-- Demo driver: splits a sample text into clauses, one paragraph at a time.
--
-- Side effects: defines the global `nonAlnums` that chIsntAlnum reads, and,
-- when Satimage.osax is missing, opens the relevant Finder folders and cancels
-- the script with error -128.
-- Returns: a list with one entry per paragraph, each entry being the list of
-- clauses that everySentence produced for that paragraph.
on run
	global nonAlnums
	-- Characters that chIsntAlnum treats as "not alphanumeric".
	set nonAlnums to {" ", "	", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "-", "_", "=", "+", "[", "]", "}", ";", ":", "'", ",", "<", ".", ">", "/", "?", "`", "~", ".", "«", "»", "“", "”", "š", "ÂŽ", "|", "\\", "§"}
	
	set thetext to "Here's to the crazy ones. This particular sample is illustrated by fig. 1, it clearly shows the correlation between figs and almonds as christmas snacks. The rebels? The troublemakers. The round pegs in the square holes. It all happened around 200 b.c. The ones who see things differently! They're not fond of rules. And they have no respect for the status quo! You can quote them, disagree with them, glorify or vilify them. One of the most infamous acronyms, to my knowledge is d.i.y."
	
	-- isUCAS needs the `uppercase` command from Satimage.osax, so stop early when it is absent.
	if not (checkforOsax by "Satimage.osax" against "This script requires satimage.osax You can download Satimage.osax from here:" from "http://www.satimage.fr/software/en/downloads/downloads_companion_osaxen.html" for my scripttitle) then
		tell application "Finder"
			open folder (path to scripting additions folder from local domain)
			open folder (path to downloads folder)
			activate
		end tell
		error number -128
	end if
	
	-- everySentence joins character lists into text, which needs empty delimiters.
	-- Remember the caller's delimiters so they can be put back afterwards.
	set savedDelims to AppleScript's text item delimiters
	set AppleScript's text item delimiters to ""
	
	set allClauses to {}
	set thePars to every paragraph of thetext
	try
		repeat with aPar in thePars
			-- Use "without clauses" instead to collect whole sentences.
			set end of allClauses to everySentence from aPar with clauses
		end repeat
	on error errMsg number errNum
		-- Restore the delimiters before passing the error on.
		set AppleScript's text item delimiters to savedDelims
		error errMsg number errNum
	end try
	set AppleScript's text item delimiters to savedDelims
	
	log "done"
	return allClauses
end run


-- Splits one paragraph into sentences, or into clauses when `clauses` is true.
--
-- Parameters:
--   aParagraph : text (or a reference to text) holding a single paragraph.
--   clauses    : boolean. false returns whole sentences; true additionally
--                splits every sentence at ",", ";" and ":".
-- Returns: a list of text. Each piece is a slice of the paragraph that ends
--   with the punctuation mark that closed it. An empty paragraph gives {}.
--
-- Sentence rule: "!" and "?" always end a sentence. A "." ends a sentence only
-- when it is followed by a space and then an uppercase character that is
-- neither a digit nor listed in nonAlnums; every other "." is taken to belong
-- to an abbreviation or acronym ("fig. 1", "b.c.", "d.i.y.").
--
-- The paragraph is scanned from its end, on a reversed copy, so all offsets
-- below are positions in the reversed text; position p there corresponds to
-- position (ct - p + 1) in the original.
--
-- Relies on the sibling handlers min3, chIsntAlnum, isnumber and isUCAS.
to everySentence from aParagraph given clauses:clauses
	local srcText, tt, ct, ofa, ofb, ofc, ofs, factor, isAbbrev, rparagraph, remainder, nextCh, tmp_ofs, ofs_saved, startPos, endPos, aSentence, sentenceText, Sentences, revSentence, theClauses
	
	set srcText to aParagraph as text
	set ct to count srcText
	if ct is 0 then return {} -- nothing to split (e.g. a blank line)
	
	set rparagraph to reverseText(srcText)
	set ofa to offset of "." in rparagraph
	set ofb to offset of "!" in rparagraph
	set ofc to offset of "?" in rparagraph
	set ofs to min3({ofa, ofb, ofc}) -- last terminator of the paragraph
	
	if ofs is 0 then
		-- No terminator at all: the whole paragraph is one sentence.
		set Sentences to {srcText}
	else
		-- tt collects sentence end positions (in the original text), last sentence first.
		set tt to {}
		if ofs > 1 then set end of tt to ct -- keep text that follows the last terminator
		set end of tt to (ct - ofs + 1)
		-- factor is the running search base; starting it at ofs - 1 keeps the offsets
		-- right when the paragraph does not end with a terminator.
		set factor to ofs - 1
		set ofs_saved to 0 -- distance covered by periods skipped as abbreviations
		
		repeat
			if ofs >= ct then exit repeat -- nothing left to scan
			set remainder to text (ofs + 1) thru -1 of rparagraph
			set ofa to offset of "." in remainder
			set ofb to offset of "!" in remainder
			set ofc to offset of "?" in remainder
			set tmp_ofs to min3({ofa, ofb, ofc})
			
			if tmp_ofs is 0 then exit repeat -- we're done
			
			-- Only a period can belong to an abbreviation or acronym.
			set isAbbrev to false
			if tmp_ofs = ofa then
				set isAbbrev to true
				-- In the reversed text the character before the period is the one
				-- that follows it in the original.
				if character (tmp_ofs + ofs - 1) of rparagraph = space then
					set nextCh to character (tmp_ofs + ofs - 2) of rparagraph
					-- A space followed by an uppercase, non-numeric character ends the sentence.
					if (not chIsntAlnum(nextCh)) and (not isnumber(nextCh)) and isUCAS(nextCh) then set isAbbrev to false
				end if
			end if
			
			if not isAbbrev then
				set factor to factor + ofs_saved + tmp_ofs + 1
				set ofs to factor
				set end of tt to ct - factor + 2
				set ofs_saved to 0
			else
				-- Skip this period but remember how far we moved past the last real boundary.
				set ofs_saved to ofs_saved + tmp_ofs
				set ofs to ofs + tmp_ofs
			end if
		end repeat
		
		set end of tt to 0
		
		-- Build the sentences from the collected boundaries, in reading order.
		set tt to reverse of tt
		set Sentences to {}
		repeat with i from 1 to ((count tt) - 1)
			set startPos to (item i of tt) + 1
			set endPos to item (i + 1) of tt
			-- Adjacent terminators ("?!") can produce an empty span; skip it.
			if startPos <= endPos then set end of Sentences to text startPos thru endPos of srcText
		end repeat
	end if
	
	if not clauses then return Sentences
	
	set theClauses to {}
	repeat with aSentence in Sentences
		set sentenceText to contents of aSentence
		set ofs to 0
		set factor to 0
		set tt to {}
		set ct to count sentenceText
		set revSentence to reverseText(sentenceText)
		repeat
			if ofs >= ct then exit repeat -- nothing left to scan
			set remainder to text (ofs + 1) thru -1 of revSentence
			set ofa to offset of "," in remainder
			set ofb to offset of ";" in remainder
			set ofc to offset of ":" in remainder
			set ofs to min3({ofa, ofb, ofc})
			
			if ofs is 0 then exit repeat
			
			set factor to factor + ofs + 1
			set ofs to factor
			set end of tt to ct - factor + 2
		end repeat
		
		if tt is not {} then
			set tt to {ct} & tt
			set end of tt to 0
			set tt to reverse of tt
			
			repeat with i from 1 to ((count tt) - 1)
				set startPos to (item i of tt) + 1
				set endPos to item (i + 1) of tt
				-- A sentence that ends with a clause separator yields an empty span; skip it.
				if startPos <= endPos then set end of theClauses to text startPos thru endPos of sentenceText
			end repeat
		else -- no clauses
			set end of theClauses to sentenceText
		end if
	end repeat
	return theClauses
end everySentence

-- Private helper for everySentence: returns theText with its characters in
-- reverse order. The text item delimiters are set to "" only for the join and
-- put back afterwards, so the result does not depend on the caller's setting.
on reverseText(theText)
	local savedDelims, flipped
	set savedDelims to AppleScript's text item delimiters
	set AppleScript's text item delimiters to ""
	set flipped to (reverse of (characters of theText)) as text
	set AppleScript's text item delimiters to savedDelims
	return flipped
end reverseText

-- Reports whether the first character of ach is listed in the global
-- nonAlnums, i.e. is regarded as something other than a letter or digit.
-- Returns true when it is listed, false otherwise.
to chIsntAlnum(ach)
	global nonAlnums
	return (first character of ach is in nonAlnums)
end chIsntAlnum

-- Returns true when aStr can be coerced to a number, false when the coercion
-- raises an error.
on isnumber(aStr)
	local coercible
	set coercible to true
	try
		aStr as number
	on error
		set coercible to false
	end try
	return coercible
end isnumber

-- Returns true when the first character of ch equals the uppercased form of
-- ch, compared case-sensitively. `uppercase` is the Satimage.osax command.
on isUCAS(ch) -- Satimage.osax
	local sameAsUpper
	considering case
		set sameAsUpper to (first character of ch = (uppercase ch))
	end considering
	return sameAsUpper
end isUCAS

-- Returns the smaller of a and b; b is returned when the two are equal.
on min(a, b)
	if a < b then return a
	return b
end min

-- Returns the smallest value greater than zero among the first three items of
-- l, or 0 when none of them is greater than zero.
on min3(l)
	local smallest, candidate
	set smallest to 0 -- 0 doubles as "nothing found yet"
	repeat with i from 1 to 3
		set candidate to item i of l
		if candidate > 0 then
			if smallest is 0 or candidate < smallest then set smallest to candidate
		end if
	end repeat
	return smallest
end min3

-- Checks whether a scripting addition is installed and, if it is not, offers
-- to open its download page.
--
-- Parameters:
--   OsaxName    : file name of the scripting addition to look for.
--   eMsg        : message shown in the dialog when it is missing.
--   dlUrl       : download URL; shown as the dialog's default answer and
--                 opened in Safari when the user clicks "Go".
--   scripttitle : title of the dialog.
-- Returns: true when the file exists in the local or user scripting additions
--   folder, false otherwise (whatever the user chose in the dialog).
to checkforOsax by OsaxName against eMsg from dlUrl for scripttitle
	-- http://macscripter.net/viewtopic.php?id=39190
	local systemDir, homeDir, isInstalled, clicked, wantsDownload
	set systemDir to path to scripting additions folder from local domain as text
	set homeDir to path to scripting additions folder from user domain as text
	
	tell application "System Events"
		set isInstalled to (exists file (systemDir & OsaxName))
		if not isInstalled then set isInstalled to (exists file (homeDir & OsaxName))
	end tell
	
	if isInstalled then return true
	
	-- Cancelling the dialog raises an error; the try block swallows it so that
	-- wantsDownload simply stays false.
	set wantsDownload to false
	try
		tell application "SystemUIServer"
			activate
			
			set clicked to button returned of (display dialog eMsg with title scripttitle default answer dlUrl buttons {"Go", "Ok"} cancel button 2 default button 1 with icon 2)
			set wantsDownload to (clicked = "Go")
			
		end tell
	end try
	
	if wantsDownload then
		tell application "Safari"
			activate
			open location dlUrl
		end tell
	end if
	return false
	
end checkforOsax