Created
January 24, 2022 19:28
-
-
Save chrispahm/c226cca151b25147869288600151a5f8 to your computer and use it in GitHub Desktop.
Stream a GeoJSON file and record the start byte and byte length of each Feature record in the file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { open } from 'fs/promises' | |
import { Buffer } from 'buffer' | |
// Chunk size, in bytes, handed to the read stream as its `highWaterMark` (8 KiB).
const HIGHWATERMARK = 8 * 1024;
/**
 * Streams a GeoJSON file and records, for every Feature object found, the byte
 * offset at which it starts and how many bytes it occupies.
 *
 * Features are located with a regular expression, so only "standard" GeoJSON
 * without foreign members is supported
 * (https://datatracker.ietf.org/doc/html/rfc7946#section-6.1): `"type": "Feature"`
 * must be the first key of the object, `properties` must be present and must be
 * the last key. Deeply nested `properties` values containing `}}` before the
 * end of the feature are not supported by this pattern.
 *
 * @param {object} fd - Open file handle exposing `createReadStream()`. It is
 *   NOT closed by this function (`autoClose: false`).
 * @param {{ highWaterMark?: number }} [options] - Optional stream tuning;
 *   `highWaterMark` defaults to the module-level `HIGHWATERMARK`.
 * @returns {Promise<Array<{ startPosition: number, byteLength: number }>>}
 *   One entry per feature, in file order. Rejects if the stream errors.
 */
function getFeaturePositionsInFile(fd, { highWaterMark = HIGHWATERMARK } = {}) {
  return new Promise((resolve, reject) => {
    const featurePositionsInFile = [];
    const stream = fd.createReadStream({ highWaterMark, autoClose: false });
    // this RegEx will solely work with standard GeoJSON without any foreign members:
    // https://datatracker.ietf.org/doc/html/rfc7946#section-6.1
    // The properties object has to be present, and has to be the last key in the GeoJSON object
    const jsonExtractor = /\{[\n\r\s]*?"type":[\n\r\s]*?"Feature"[\S\s]*?\}(?:[\n\r\s]*\})+/g;
    // A streaming decoder keeps multi-byte characters intact when they are split
    // across chunks. `ignoreBOM: true` keeps a leading BOM in the decoded text so
    // that its bytes are still counted in the offsets.
    const decoder = new TextDecoder('utf-8', { ignoreBOM: true });
    // Decoded text that has not been attributed to a feature yet ...
    let pending = '';
    // ... and the byte offset in the file of its first character.
    let pendingStartByte = 0;

    const extractFeatures = (isFinal) => {
      let consumedChars = 0;
      let consumedBytes = pendingStartByte;
      let match;
      jsonExtractor.lastIndex = 0;
      while ((match = jsonExtractor.exec(pending)) !== null) {
        const featureString = match[0];
        const endInString = match.index + featureString.length;
        // The pattern ends in a greedy run of closing braces. If the match
        // reaches the end of the buffered text, the next chunk may still extend
        // it, so wait for more data instead of emitting a truncated feature.
        if (!isFinal && !/\S/.test(pending.slice(endInString))) {
          break;
        }
        // Only the text between the previous feature and this one needs to be
        // measured; everything before it has already been converted to bytes.
        const startPosition =
          consumedBytes + Buffer.byteLength(pending.slice(consumedChars, match.index), 'utf8');
        const byteLength = Buffer.byteLength(featureString, 'utf8');
        featurePositionsInFile.push({ startPosition, byteLength });
        consumedChars = endInString;
        consumedBytes = startPosition + byteLength;
      }
      // Drop the text of emitted features to free memory.
      pending = pending.slice(consumedChars);
      pendingStartByte = consumedBytes;
    };

    stream.on('data', (chunk) => {
      pending += decoder.decode(chunk, { stream: true });
      extractFeatures(false);
    });
    stream.on('end', () => {
      // Flush the decoder, then emit any feature that was held back.
      pending += decoder.decode();
      extractFeatures(true);
      resolve(featurePositionsInFile);
    });
    stream.on('error', reject);
  });
}
/**
 * Reads `length` bytes starting at `startPosition` from an open file handle
 * and parses them as a single JSON value (one GeoJSON feature).
 *
 * @param {object} fd - Open file handle exposing
 *   `read(buffer, offset, length, position)` that resolves to `{ bytesRead }`.
 * @param {number} startPosition - Byte offset of the feature in the file.
 * @param {number} length - Number of bytes the feature occupies.
 * @returns {Promise<any>} The parsed feature.
 * @throws {RangeError} If the file ends before `length` bytes could be read.
 * @throws {SyntaxError} If the bytes read are not valid JSON.
 */
async function readSingleFeatureFromFile(fd, startPosition, length) {
  const buff = Buffer.alloc(length);
  let filled = 0;
  // A single read may return fewer bytes than requested, so keep reading until
  // the buffer is full; otherwise the zero padding would break JSON.parse.
  while (filled < length) {
    const { bytesRead } = await fd.read(buff, filled, length - filled, startPosition + filled);
    if (bytesRead === 0) {
      throw new RangeError(
        `Unexpected end of file: wanted ${length} bytes at position ${startPosition}, got ${filled}`
      );
    }
    filled += bytesRead;
  }
  return JSON.parse(buff.toString('utf8'));
}
/**
 * Loads a single feature by its index in the lookup array.
 *
 * @param {number} featureIndexToRead - Zero-based index into `featurePositionsInFile`.
 * @param {Array<{ startPosition: number, byteLength: number }>} featurePositionsInFile -
 *   Lookup array of feature byte ranges.
 * @param {object} [fileHandle=fd] - File handle to read from; defaults to the
 *   module-level `fd` so existing two-argument calls keep working.
 * @returns {Promise<any>} The parsed feature.
 * @throws {RangeError} If there is no entry at `featureIndexToRead`.
 */
async function getFeature(featureIndexToRead, featurePositionsInFile, fileHandle = fd) {
  const position = featurePositionsInFile[featureIndexToRead];
  if (position == null) {
    throw new RangeError(
      `No feature at index ${featureIndexToRead} (${featurePositionsInFile.length} features indexed)`
    );
  }
  const { startPosition, byteLength } = position;
  return readSingleFeatureFromFile(fileHandle, startPosition, byteLength);
}
// Example usage.
// source: https://raw.githubusercontent.com/node-geojson/geojson-stream/master/test/data/featurecollection.geojson
// -> has 3 features
const path = 'featurecollection.geojson';
const fd = await open(path, 'r');
try {
  // Build the lookup array of { startPosition, byteLength } once ...
  const featurePositionsInFile = await getFeaturePositionsInFile(fd);
  // ... then fetch the nth feature on demand (index 2 is the 3rd feature in the file).
  const thirdFeature = await getFeature(2, featurePositionsInFile);
  console.log(thirdFeature);
} finally {
  // Always release the file handle, even if indexing or reading failed.
  await fd.close();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment