Last active
July 12, 2021 20:08
-
-
Save NeKzor/ef166f9d7e48690dabcd712f54f9d1b1 to your computer and use it in GitHub Desktop.
Every Mario Kart World Record from 1996-2021. Datasets available at https://www.dolthub.com/repositories/nekz/mkwrs/data/master
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fetch = require('node-fetch'); | |
const fs = require('fs'); | |
const { JSDOM } = require('jsdom'); | |
const path = require('path'); | |
/** Base URL that every scraped route is built from. */
const baseApi = 'https://mkwrs.com';

/** Options passed to every `fetch` call made by the scraper. */
const fetchOptions = {
    headers: {
        'User-Agent': 'ne^',
    },
};

/**
 * Game identifiers, processed in this order. Each one is used as a URL path
 * segment, as the data file name and as the name of the created table.
 */
const games = [
    'mk8dx',
    'mk8',
    'mk7',
    'mkwii',
    'mkds',
    'mkdd',
    'mksc',
    'mk64',
    'smk',
];
/**
 * Finds the row that has the largest number of own enumerable keys.
 *
 * When several rows share the highest key count, the first of them wins.
 *
 * @param {object[]} rows - Rows to inspect.
 * @returns {object} The row with the most keys, or an empty object when
 *   `rows` is empty (so callers can still safely call `Object.keys` on it).
 */
const findMostColumns = (rows) => {
    // An empty list has no "widest" row; return a key-less placeholder
    // instead of failing on `Object.keys(undefined)`.
    if (rows.length === 0) {
        return {};
    }

    let [most] = rows;
    let count = Object.keys(most).length;

    for (const row of rows) {
        const newCount = Object.keys(row).length;
        // Strictly greater: earlier rows keep precedence on ties.
        if (newCount > count) {
            count = newCount;
            most = row;
        }
    }

    return most;
};
/** True when the script was started with the `--refresh` command line flag. */
const refresh = process.argv.slice(2).includes('--refresh');
/**
 * Entry point: (re)creates `../import.sh` and appends, for every game, a
 * `dolt sql` CREATE TABLE command plus a `dolt table import` command.
 *
 * With `refresh` set, each game is scraped and its rows are written to
 * `../data/<game>.json`; otherwise the rows are read back from that file.
 * Games without any rows are skipped with a warning.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
    const importScript = path.join(__dirname, '/../import.sh');
    const dataFolder = path.join(__dirname, '/../data');

    // Wraps a value in single quotes for bash, escaping embedded single quotes,
    // so paths containing spaces or shell metacharacters stay one argument.
    const shellQuote = (value) => `'${value.replace(/'/g, "'\\''")}'`;

    fs.writeFileSync(importScript, '#!/bin/bash\n', 'utf-8');

    if (refresh) {
        // The data files are written below; make sure their folder exists.
        fs.mkdirSync(dataFolder, { recursive: true });
    }

    for (const game of games) {
        const dataFile = path.join(dataFolder, `/${game}.json`);

        let rows = [];
        if (refresh) {
            const tracks = await scrapeTracks(game);
            rows = await scrapeGame(game, tracks);
            fs.writeFileSync(dataFile, JSON.stringify({ rows }, null, 4), 'utf-8');
        } else {
            console.log('importing', game);
            rows = JSON.parse(fs.readFileSync(dataFile, 'utf-8')).rows;
        }

        // Without rows there are no columns to derive a table schema from.
        if (!Array.isArray(rows) || rows.length === 0) {
            console.warn('no rows for', game, '- skipping table creation');
            continue;
        }

        // The schema is taken from the row with the most keys; every column is a varchar.
        const columns = Object.keys(findMostColumns(rows))
            .map((column) => ` ${column} varchar(255)`)
            .join(',\n');
        const createTable = `dolt sql -q 'CREATE TABLE ${game} (\n${columns}\n)'`;
        const importTable = `dolt table import -u ${game} ${shellQuote(dataFile)}`;

        fs.appendFileSync(importScript, `\n${createTable}\n${importTable}\n`, 'utf-8');
    }
};
/**
 * Scrapes the track list of a game from `${baseApi}/${game}`.
 *
 * Two row layouts are handled inside the `.wr` tables:
 * - the first cell starts with a link: one track per row;
 * - the first cell (with `colspan="2"`) starts with a nested table: its first
 *   cell holds the track name and every following cell links to one category.
 *   Once such a row has been seen, plain link rows are ignored.
 *
 * Links starting with `http`, links without an `href` and cells without a
 * child element are skipped.
 *
 * @param {string} game - Game identifier used as URL path segment.
 * @returns {Promise<Array<{name: string, id: string, category?: string}>>}
 *   Found tracks; `id` is the link target with the `display.php?track=` prefix
 *   length cut off.
 */
const scrapeTracks = async (game) => {
    console.log('scraping', game, '...');

    const route = `${baseApi}/${game}`;
    const res = await fetch(route, fetchOptions);
    console.log('[GET]', route, ':', res.status);

    const text = await res.text();
    const document = new JSDOM(text).window.document;

    const trackLinkPrefix = 'display.php?track=';
    const tracks = [];

    const tables = document.querySelectorAll('.wr');
    console.log('found', tables.length, 'tables');

    let tableMode = false;

    for (const table of tables) {
        // Skip the first row and keep only rows whose grandparent is the `.wr`
        // table itself, which excludes rows of nested tables.
        const rows = [...table.querySelectorAll('tr')]
            .slice(1)
            .filter((tr) => tr.parentElement.parentElement.className === 'wr');
        console.log('found', rows.length, 'rows');

        for (const row of rows) {
            const track = row.querySelector('td');
            const firstChild = track ? track.children[0] : null;
            if (!firstChild) {
                continue;
            }

            if (firstChild.tagName === 'A' && !tableMode) {
                const name = firstChild.textContent;
                const link = firstChild.getAttribute('href');

                // `getAttribute` yields null for a missing attribute.
                if (!link || link.startsWith('http')) {
                    continue;
                }

                console.log(name, link);
                tracks.push({
                    name,
                    id: link.slice(trackLinkPrefix.length),
                });
            } else if (firstChild.tagName === 'TABLE') {
                tableMode = true;

                const nestedBody = firstChild.children[0];
                if (!nestedBody) {
                    continue;
                }

                if (track.getAttribute('colspan') !== '2') {
                    continue;
                }

                const [trackTd, ...tds] = [...nestedBody.querySelectorAll('td')];
                // `querySelectorAll` always returns a list, so emptiness has to
                // be detected through the missing first cell.
                if (!trackTd) {
                    continue;
                }

                const name = trackTd.textContent;
                console.log(name);

                for (const td of tds) {
                    const anchor = td.firstElementChild;
                    if (!anchor) {
                        continue;
                    }

                    const link = anchor.getAttribute('href');
                    const category = anchor.textContent;
                    if (!link || link.startsWith('http')) {
                        continue;
                    }

                    console.log(link);
                    tracks.push({
                        name,
                        id: link.slice(trackLinkPrefix.length),
                        category,
                    });
                }
            }
        }
    }

    return tracks;
};
/** Text of the cell's first child element, or of the cell itself when it has no child element. */
const cellText = (td) => (td.firstElementChild ? td.firstElementChild.textContent : td.textContent);

/** Attribute of the cell's first child element, or null when the cell has no child element. */
const cellChildAttribute = (td, attribute) =>
    td.firstElementChild ? td.firstElementChild.getAttribute(attribute) : null;

/**
 * Player id taken from the link inside a player cell, or null without a link target.
 * NOTE(review): 19 is presumably the length of a `profile.php?player=` prefix - confirm.
 */
const cellPlayerId = (playerTd) => {
    const href = cellChildAttribute(playerTd, 'href');
    return href === null ? null : href.slice(19);
};

/** `title` of the element nested two levels deep inside a nation cell, or null when absent. */
const cellNation = (nationTd) => {
    const nested = nationTd.firstElementChild && nationTd.firstElementChild.firstElementChild;
    return nested ? nested.getAttribute('title') : null;
};

/** Builds a record that maps each column name to the text of the cell at the same index. */
const recordFromColumns = (names, tds) => {
    const record = {};
    names.forEach((name, idx) => {
        record[name] = tds[idx].textContent;
    });
    return record;
};

/** Leading-cell fields of an mk64 row (separate NTSC and PAL time cells). */
const readMk64Fields = (allTds) => {
    const [dateTd, ntscTimeTd, palTimeTd, playerTd, nationTd] = allTds;
    return {
        date: cellText(dateTd),
        note: cellChildAttribute(dateTd, 'title'),
        ntscTime: cellText(ntscTimeTd),
        ntscVideo: cellChildAttribute(ntscTimeTd, 'href'),
        palTime: cellText(palTimeTd),
        palVideo: cellChildAttribute(palTimeTd, 'href'),
        player_id: cellPlayerId(playerTd),
        player_name: cellText(playerTd),
        player_nation: cellNation(nationTd),
    };
};

/** Leading-cell fields of an mkwii row (extra Mii name cell, ghost link in the last cell). */
const readMkwiiFields = (allTds) => {
    const [dateTd, timeTd, playerTd, miiNameTd, nationTd] = allTds;
    const ghostTd = allTds[allTds.length - 1];
    return {
        date: cellText(dateTd),
        note: cellChildAttribute(dateTd, 'title'),
        time: cellText(timeTd),
        video: cellChildAttribute(timeTd, 'href'),
        ghost: cellChildAttribute(ghostTd, 'href'),
        player_id: cellPlayerId(playerTd),
        player_name: cellText(playerTd),
        player_nation: cellNation(nationTd),
        player_mii: miiNameTd.textContent,
    };
};

/** Leading-cell fields of a row of any other game. */
const readDefaultFields = (allTds) => {
    const [dateTd, timeTd, playerTd, nationTd] = allTds;
    return {
        date: cellText(dateTd),
        note: cellChildAttribute(dateTd, 'title'),
        time: cellText(timeTd),
        video: cellChildAttribute(timeTd, 'href'),
        // The second child of the time cell carries the device in its title, when present.
        device: timeTd.children[1] ? timeTd.children[1].getAttribute('title') : null,
        player_id: cellPlayerId(playerTd),
        player_name: cellText(playerTd),
        player_nation: cellNation(nationTd),
    };
};

/**
 * Scrapes all record rows of every given track of a game.
 *
 * For each track, `${baseApi}/${gameName}/display.php?track=<id>` is fetched
 * and the second `.wr` table is read: its first row supplies the column names
 * (lower-cased, spaces replaced by underscores) after the leading
 * date/time/player/nation cells, which are parsed separately per game.
 * Tracks whose page has no such table are skipped with a warning.
 *
 * The key insertion order of each record is kept stable (columns first, then
 * the parsed leading fields, then the track fields).
 *
 * @param {string} gameName - Game identifier; `mk64` and `mkwii` have five
 *   leading cells instead of four.
 * @param {Array<{name: string, id: string, category?: string}>} tracks -
 *   Tracks to scrape.
 * @returns {Promise<object[]>} One record per scraped row, in page order.
 */
const scrapeGame = async (gameName, tracks) => {
    const isMk64 = gameName === 'mk64';
    const isMkwii = gameName === 'mkwii';
    const columnOffset = isMk64 || isMkwii ? 5 : 4;

    let readLeadingFields = readDefaultFields;
    if (isMk64) {
        readLeadingFields = readMk64Fields;
    } else if (isMkwii) {
        readLeadingFields = readMkwiiFields;
    }

    console.log('scraping tracks for', gameName, '...');

    const result = [];

    for (const track of tracks) {
        const route = `${baseApi}/${gameName}/display.php?track=${track.id}`;
        const res = await fetch(route, fetchOptions);
        console.log('[GET]', route, ':', res.status);

        const text = await res.text();
        const document = new JSDOM(text).window.document;

        const recordsTable = document.querySelectorAll('.wr')[1];
        const [columnRow, ...rows] = recordsTable ? [...recordsTable.querySelectorAll('tr')] : [];
        if (!columnRow) {
            // Nothing to parse on this page; keep going with the other tracks.
            console.warn('no records table found at', route);
            continue;
        }

        const columnNames = [...columnRow.querySelectorAll('th')]
            .map((th) => th.textContent)
            .slice(columnOffset)
            .map((column) => column.replace(/ /g, '_').toLowerCase());

        // A last column named "combination" marks the layout in which every
        // record spans two table rows.
        const isMk8DxGcnBabyPark = columnNames[columnNames.length - 1] === 'combination';
        let insertSchroomsAndCombination = false;
        let record = null;

        for (const row of rows) {
            const allTds = [...row.querySelectorAll('td')];
            if (allTds.length <= 2) {
                continue;
            }

            if (isMk8DxGcnBabyPark && insertSchroomsAndCombination) {
                // Second row of a two-row record: extend the record that was
                // already pushed for the previous row.
                insertSchroomsAndCombination = false;
                record.shrooms = allTds[0].textContent;
                record.tires = allTds[1].textContent;
                record.glider = allTds[2].textContent;
                continue;
            }

            const tds = allTds.slice(columnOffset);

            if (isMk8DxGcnBabyPark) {
                insertSchroomsAndCombination = true;
                record = recordFromColumns(columnNames.slice(0, 7), tds);
                // NOTE(review): index 7 is skipped here - presumably the cell
                // that is filled from the following row; confirm against the page.
                record.coins = tds[8].textContent;
                record.character = tds[9].textContent;
                record.kart = tds[10].textContent;
            } else {
                record = recordFromColumns(columnNames, tds);
            }

            Object.assign(record, readLeadingFields(allTds));

            record.track_id = track.id;
            record.track_name = track.name;
            record.track_category = track.category ? track.category : null;

            result.push(record);
        }
    }

    return result;
};
// Run the script. A failure is reported and turned into a non-zero exit
// status instead of being left as an unhandled promise rejection.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment