The ‘read’ command doesn’t seem to like files of that size, but ASObjC may be OK. (The test files I created are only 1.3 GB.)
This can take a minute or more, depending on the file size, the unit size, and the speed of your computer:
use AppleScript version "2.4" -- Yosemite (10.10) or later
use framework "Foundation"
use scripting additions
-- Maximum number of "<tu>" units placed in each output file. splitTMXFile() uses this value as the upper repetition bound of the regex that matches each block of units.
property maxUnitsPerFile : 100000
on splitTMXFile(TMXfile)
	-- Split a large TMX file into a series of smaller TMX files, each containing at most maxUnitsPerFile "<tu>" units.
	--
	-- Parameter:
	--   TMXfile: a file reference whose POSIX path can be taken (e.g. the alias returned by 'choose file').
	--
	-- Output:
	--   Files written next to the original, named "<original name without extension> [NN].tmx", where NN is a zero-padded sequence number.
	--   Each file consists of the original's initial block, one block of units, and closing "</body>" and "</tmx>" tags.
	--
	-- Errors:
	--   Raises an AppleScript error if the file can't be read, if no "<tu>" units are found or matched, or if any output file can't be written.
	--
	-- Original assumptions: The TMX file is UTF-8 encoded and consists entirely of an initial block (which is to be reproduced in all the smaller files) ending with a "<body>" tag, a large number of "<tu …> … </tu>" entries, and closing "</body>" and "</tmx>" tags.
	-- Modified assumptions: The TMX text may or may not be UTF-8 encoded and "unit" sections may actually begin with straight "<tu>" tags instead of the "<tu …(+ other data)…>" type. The source file is read using a method which works out the text encoding for itself and the split text is saved with that encoding.
	set |⌘| to current application
	
	-- Read the file, letting NSString deduce the encoding. Collect the NSError too so that a failed read can be reported properly.
	set originalPath to |⌘|'s class "NSString"'s stringWithString:(POSIX path of TMXfile)
	set {originalText, originalEncoding, readError} to |⌘|'s class "NSString"'s stringWithContentsOfFile:(originalPath) usedEncoding:(reference) |error|:(reference)
	if (originalText is missing value) then error ("Couldn't read the TMX file: " & (readError's localizedDescription() as text))
	
	-- Find where the "units" start.
	set textLength to originalText's |length|()
	set unitsStart to (originalText's rangeOfString:("<tu[ >]") options:(|⌘|'s NSRegularExpressionSearch))'s location()
	-- A failed search returns a "not found" location which lies outside the text. Catch it here rather than letting it cause a range exception below.
	if ((unitsStart < 0) or (unitsStart ≥ textLength)) then error "No \"<tu>\" units were found in the TMX file."
	
	-- Get the text up to there and the two end tags.
	set initialBlock to originalText's substringWithRange:({0, unitsStart})
	set endBlock to |⌘|'s class "NSString"'s stringWithString:("</body>" & linefeed & "</tmx>")
	
	-- Set up and use a regex to match the "units" in blocks of up to the maximum number required per file.
	-- Each unit begins with "<tu " or "<tu>" and runs to its "</tu>" tag plus any text before the next "<".
	set unitsBlockRegex to |⌘|'s class "NSRegularExpression"'s regularExpressionWithPattern:("(?:<tu[ >](?:[^>]++(?<!</tu)>)++[^>]++(?<=</tu)>[^<]*+){1," & maxUnitsPerFile & "}+") options:(0) |error|:(missing value)
	set unitsBlockMatches to unitsBlockRegex's matchesInString:(originalText) options:(0) range:({unitsStart, textLength - unitsStart})
	
	-- Get the original file path without the extension to use as the basis for the file paths to be created.
	set rootPath to originalPath's stringByDeletingPathExtension()
	
	-- Work out how many digits will be required in the numeric suffixes to be included in the file names.
	set newFilesNeeded to (count unitsBlockMatches)
	if (newFilesNeeded = 0) then error "No complete \"<tu> … </tu>\" units could be matched in the TMX file."
	set suffixLength to (count (newFilesNeeded as text))
	-- n is a power of ten one digit longer than the largest suffix. Adding i to it and dropping the leading "1" gives a zero-padded number.
	set n to (10 ^ suffixLength) as integer
	
	-- Work through the "unit" block matches.
	repeat with i from 1 to newFilesNeeded
		-- Create a new text consisting of the initial block, the current matched block of "units", and the end tags.
		set thisUnitsBlockMatch to item i of unitsBlockMatches
		set thisUnitsBlock to (originalText's substringWithRange:(thisUnitsBlockMatch's range()))
		-- Plain appends are used here so as not to depend on how the scripting bridge handles variadic methods.
		set newText to ((initialBlock's stringByAppendingString:(thisUnitsBlock))'s stringByAppendingString:(endBlock))
		
		-- Put together a path with an appropriate numeric suffix before the extension and save the text to it.
		set newPath to (rootPath's stringByAppendingString:(" [" & text 2 thru -1 of (n + i as text) & "].tmx"))
		set {didWrite, writeError} to (newText's writeToFile:(newPath) atomically:(true) encoding:(originalEncoding) |error|:(reference))
		-- Stop at the first failed write instead of silently producing an incomplete set of files.
		if (not (didWrite as boolean)) then error ("Couldn't write \"" & (newPath as text) & "\": " & (writeError's localizedDescription() as text))
	end repeat
end splitTMXFile
-- Ask the user which TMX file to split, then hand it to the splitter.
set chosenTMXFile to (choose file with prompt "Choose a gigantic TMX file …" of type {"tmx"})
splitTMXFile(chosenTMXFile)
Edit: Script modified to make it more flexible in what it recognises.
OK. That particular file differs from the others in that it’s UTF-16LE encoded and has plain “<tu>” tags instead of the “<tu …(+ other data)…>” variety. I’ve modified the script above to allow for these differences. It saves the extracted text with the same encoding as the original — or at least with what’s deduced to be the original encoding by the method now used to read the file.