The testbed worked good!
I am now to start with the actual parsing of the html, transposing it into a csv table, under pretty much the conditions I stated for starters. that the OP will have to clean up the csv later on, by deleting rows and columns, Which isn’t that hard when you do it in Excel anyway.
It has struck me in the mean time, that this might work well, as an abstraction tool, for people that wants to get information off web pages on a regular basis, without having to resort to get the data out of the DOM tree.
The rinsing of the html for uneccessary tags is now implemented, to make the actual parsing and transposing easier, which is the next step.
I got some handlers by taking this prelimnary step, that I can reuse later. As for the “tag-Datastructures”, no final decision are taken about those yet, I have just implemented something that works for the moment, which I think I may optimize as much as I can by spezialized handlers later on.
# 1: We're implementing the rinse html handler
# We do this up front, or we'll have to drag it with us during the parsing, which is
# complicating matters, and making things go slower.
property tlvl : me
# The main handler for parsing html into a csv table
script html2csv
property parent : AppleScript
property scriptTitle : "Html2Csv"
property tagCategories : {"singleLine", "discardables", "edibles", "digestives"}
property _singleLineTags : {{"<br />"}, {"<hr />"}, {"<embed/>"}}
property singleLineTags : missing value # single list countepart, for tests.
property _discardables : {{"<!--;", "-->"}, {"<pre;", "</pre>"}, {"<code;", "</code>"}, {"<object;", "</object>"}, {"<form;", "</form>"}, {"<script;", "</script>"}, {"<embed;", "</embed>"}, {"<b;", "</b>"}, {"<i;", "</i>"}, {"<u;", "</u>"}, {"<small;", "</small>"}, {"<strong;", "</strong>"}, {"<strike;", "</strike>"}, {"<em;", "</em>"}, {"<span;", "</span>"}, {"<big;"}, {"</big>"}, {"<aside;", "</aside>"}, {"<footer;", "</footer>"}}
property discardables : missing value
# those tags are really ignored, so we can use them in their full form.
# This may lead to bugs, if there are classes or id or other attributes
# set on those, but this is nothing I see as a problem, since it will really
# just be ignored, and we have no semantic use of classes or id's inside those
# tags anyway.
property _edibles : {{"<div;", "</div>"}, {"<nav>", "</nav>"}, {"<section;", "</section>"}, {"<ul;", "</ul>"}, {"<ol;", "</ol>"}}
property edibles : missing value
property _digestives : {{"<p;", "</p>"}, {"<li;", "</li"}, {"<a;", "</a>"}, {"<image;", "</image>"}}
property digestives : missing value
property __tagDictsInited : missing value
on run
local theCsvTable, theFileContents, fna, e, n
try
set {fna, theCsvTable} to {choose file, {}}
set theFileContents to tlvl's readFileAsUtf8(fna)
# preparation of the data removing of surrounding body block
local startpos, endpos
set startpos to closureposOfATag for theFileContents by "<body"
set endpos to offset of "</body>" in theFileContents
set theFileContents to text startpos thru (endpos - 1) of theFileContents
# The actual parsing takes place here, will use prev defined recursion.
set theCsvTable to parse(theFileContents, theCsvTable) # in progress
# write the table to disk
tlvl's writeToFileAsUtf8((fna as text) & ".csv", theCsvTable)
on error e number n
local cr, sep, errmsg
-- Chris Stone
set {cr, sep} to {return, "------------------------------------------"}
set errmsg to sep & cr & "Error: " & e & cr & sep & cr & "Error
Number: " & n & cr & sep
try
tell application "SystemUIServer"
activate
display dialog errmsg with title scriptTitle buttons {"Ok"} default button 1
end tell
end try
end try
end run
to closureposOfATag for aText by startOfAtag
# text is presumed to start at pos 1
local startTagPos, endTagPos, cl, hyphCount
set startTagPos to offset of startOfAtag in aText
if startTagPos = 0 then error "closureposOfATag : Missing Tag" number 3099
set {cl, hyphCount} to {(every character of aText), 0}
repeat with i from (startTagPos + (length of startOfAtag)) to (length of aText)
if item i of cl = "\"" then
set hyphCount to hyphCount + 1
else if item i of cl = ">" and hyphCount mod 2 = 0 then
return (i + 1)
end if
end repeat
end closureposOfATag
to nextTagHead(startpos, htmlText) # returns head of tag, and startpos of it.
# we don't consider hyphens, as we should really not meet a hyphen before we reach
# an end of a tag.
local startTagPos, endTagPos, cl, hyphCount
set startTagPos to offset of "<" in (text startpos thru -1 of htmlText)
if startTagPos = 0 then return null # We're done.
# special consdiderations for comment tags"
if text (startpos + startTagPos - 1) thru (startpos + startTagPos + 2) of htmlText = "<!--" then
set startOffset to (startpos + startTagPos - 1)
return {(text startOffset thru (startOffset + 3) of htmlText) & ";", startOffset}
else
set startOffset to (startpos + startTagPos - 1)
set endTagPos to offset of ">" in (text startOffset thru -1 of htmlText)
set spacePos to offset of space in (text startOffset thru -1 of htmlText)
if endTagPos < spacePos then
return {(text startOffset thru (startOffset + endTagPos - 2) of htmlText) & ";", startOffset}
else
return {(text startOffset thru (startOffset + spacePos - 2) of htmlText) & ";", startOffset}
end if
end if
end nextTagHead
to rinseHtml(htmlText)
# removes as much as possible of stuff we really don't need to deal with.
# this routine lends itself to the usage of a stack and a record,
# to make it easy for us, to truncate away the unnecessary tags.
# find next tag in the text, decide if the head of it is among the discardables
# if it is, keep the items, and find the end of it
# push the start and end posistion down, onto a stack.
local spos, discardsStack
set discardsStack to tlvl's Stack's makeStack()
set spos to 1
repeat
set theTagHeadData to nextTagHead(spos, htmlText)
if theTagHeadData is not null then
set spos to item 2 of theTagHeadData
set thetaghead to item 1 of theTagHeadData
else
exit repeat
end if
if ismember(thetaghead, discardables) then
set tagtail to tagCounterPart(thetaghead, discardables)
set endpos to findTagEndPos((spos + (length of thetaghead)), tagtail, htmlText)
discardsStack's Push({spos, endpos})
set spos to endpos
else if ismember(thetaghead, singleLineTags) then
set endpos to spos + 2
discardsStack's Push({spos, endpos})
set spos to endpos
else
set spos to spos + 2
end if
end repeat
# We'll contract the text from behind - forwards
local pos
set pos to discardsStack's Pop()
repeat while pos is not {}
set htmlText to text 1 thru ((item 1 of pos) - 1) of htmlText & text ((item 2 of pos) + 1) thru -1 of htmlText
set pos to discardsStack's Pop()
end repeat
return htmlText
end rinseHtml
to findTagEndPos(startpos, tagtail, htmlText)
local startTagPos, endTagPos, cl, hyphCount
set startTagPos to offset of tagtail in (text startpos thru -1 of htmlText)
if startTagPos = 0 then error "findTagEndPos : Missing Tag" number 3099
set startOffset to (startpos + startTagPos - 1)
set endTagPos to (length of tagtail) + startOffset - 1
return endTagPos
end findTagEndPos
to tagCounterPart(thetaghead, theTagSet)
local linenumber
set linenumber to tlvl's indexOfItem(thetaghead, theTagSet)
return item (linenumber + 1) of theTagSet
end tagCounterPart
to parse(utf8HtmlText, csvTable) # work in progress
local washedHtml, curLine
if __tagDictsInited is missing value then initSetsOfTags()
-- return utf8HtmlText
set {washedHtmlText, curLine} to {rinseHtml(utf8HtmlText), {}}
return 0 # Not further
parse2csv for washedHtmlText against csvTable by curLine
# convert csvTable to text - for this to be easy, it would be smart to use text fields for starters.
# just adding return as line-endings!
return csvTable
end parse
to ismember(theTag, theSet)
# flattens the list into one long thing, good to search within
# I believe this will do the trick whatever the nesting.
if theSet contains theTag then
return true
else
return false
end if
end ismember
to itsSet(theTag)
if ismember(theTag, singleLineTags) then
return "singleLine"
else if ismember(theTag, discardables) then
return "discardables"
else if ismember(theTag, edibles) then
return "edibles"
else if ismember(theTag, digestives) then
return "digestives"
else
error number 3399
end if
end itsSet
to initSetsOfTags()
if singleLineTags is missing value then
set singleLineTags to aFlatList of tlvl by _singleLineTags
end if
if discardables is missing value then
set discardables to aFlatList of tlvl by _discardables
end if
if edibles is missing value then
set edibles to aFlatList of tlvl by _edibles
end if
if digestives is missing value then
set digestives to aFlatList of tlvl by _digestives
end if
set __tagDictsInited to true
end initSetsOfTags
end script
############
tell html2csv to run
############
on writeToFileAsUtf8(fname, stuff)
local fref, e, n, fsz
set fref to open for access fname with write permission
try
write stuff to fref as «class utf8» starting at 0
set fsz to get eof fref
close access fref
return fsz
on error e number n
try
close access fref
end try
error "writeToFileAsUtf8 " & e number n
end try
end writeToFileAsUtf8
to readFileAsUtf8(alisForTheFile)
local fcontents, ftr, e, n
try
set ftr to (open for access (alisForTheFile))
# her stjeler vi hva vi kan av Yvan koenig, når det gjelder uti'er og csv.
on error e number n
error "readFileAsUtf8 " & e number n
end try
try
set fcontents to (read ftr as «class utf8»)
on error e number n
close access ftr
error "readFileAsUtf8 " & e number n
end try
--close the file
try
close access ftr
on error e number n
close access ftr
error "readFileAsUtf8 " & e number n
end try
return fcontents
end readFileAsUtf8
to aFlatList by nestedList
local tids, flatlist
set {tids, AppleScript's text item delimiters} to {AppleScript's text item delimiters, return}
set flatlist to text items of (nestedList as text)
set AppleScript's text item delimiters to tids
return flatlist
end aFlatList
on indexOfItem(theItem, itemsList) -- credit to Emmanuel Levy
local rs
set text item delimiters to return
set itemsList to return & itemsList & return
set text item delimiters to {""}
try
set rs to -1 + (count (paragraphs of (text 1 thru (offset of (return & theItem & return) in itemsList) of itemsList)))
on error
return 0
end try
rs
end indexOfItem
on getSingelton(the_list, item_a)
set astid to AppleScript's text item delimiters
-- Nigel Garvey's with a name change
set AppleScript's text item delimiters to return
set the_list_as_string to return & the_list & return
set AppleScript's text item delimiters to return & item_a & return
if (the_list_as_string contains result) then
set p to (count paragraphs of text item 1 of the_list_as_string)
if (p is 0) then set p to 1 -- Catch modern paragraph count for empty text.
set p to p mod 2
try
set otherItem to paragraph (p * 2 - 1) of text item (p + 1) of the_list_as_string
on error
return null
end try
set AppleScript's text item delimiters to astid
return otherItem
else
return null
end if
end getSingelton
script Stack
property parent : AppleScript
property __stack : missing value
on init()
set my __stack to {}
return me
end init
on Push(athing)
set my __stack to {athing} & my __stack
end Push
on Pop()
local athing
try
set athing to first item of my __stack
on error
return {}
end try
set my __stack to rest of my __stack
return athing
end Pop
on Peek() # For debugging
local athing
try
set athing to item 1 of my __stack
on error
return {}
end try
return athing
end Peek
to makeStack()
script Stack
property parent : tlvl's Stack # For getting at the parent
property __stack : missing value # For a unique stack!
end script
return Stack's init() # My instance.
end makeStack
end script