KevinPayravi/get-wiki-tweets.js

## get-wiki-tweets.js
/*

A console script that can be run on a MediaWiki wiki to extract all tweets linked in the mainspace.
To run this: visit a MediaWiki wiki, open your browser's developer console, paste this code in, and press enter.
It will print and re-print a growing array of Tweets until it is done searching.

How does this janky script work?
It first searches MediaWiki for all pages that contain the text "twitter".
Searching for URLs directly (e.g. twitter.com) is unreliable, depending on a wiki's search infrastructure.
This script then grabs the wikitext from all these pages and uses regex to search for any Tweet URLs.
An array of these URLs is then printed to the console.
As each network call is async, the array will be printed multiple times as it is built.

These Twitter links could then be batch processed on the Wayback Machine;
one option is using Wayback's Google Sheets batch service.
https://archive.org/services/wayback-gsheets/

*/

// MediaWiki API client (`mw.Api`) used for every request this script makes.
var api = new mw.Api();
// Accumulates every tweet URL found so far; it is re-logged as each batch of pages completes.
var twitterLinks = [];
// Matches http(s) links to a single tweet: twitter.com/<anything>/status/<digits>,
// with an optional "www." prefix. The dot in "twitter.com" is escaped so that it
// only matches a literal "." (an unescaped dot would also accept e.g. "twitterXcom").
const REGEX = /(https?:\/\/(www\.)?twitter\.com\/\S*?\/status\/\d+)/g;

/**
 * Fetches one page of full-text search results for "twitter" starting at
 * `offset`, requests the next page when the API reports a continuation, then
 * downloads the wikitext of every hit and appends each URL matching `REGEX`
 * to the global `twitterLinks` array.
 *
 * Requests complete asynchronously, so `twitterLinks` is logged once per
 * finished batch of pages and grows over time.
 *
 * @param {number} offset - Value sent as the search API's `sroffset` parameter.
 */
function getResults(offset) {
  // Without a rejection handler a failed request would vanish silently and
  // the output would simply be incomplete.
  const reportFailure = function (code, details) {
    console.error("get-wiki-tweets: API request failed", code, details);
  };

  api.get({
    "action": "query",
    "format": "json",
    "list": "search",
    "utf8": 1,
    "srsearch": "twitter",
    "srlimit": "50",
    "srwhat": "text",
    "srinfo": "",
    "srprop": "",
    "sroffset": offset
  }).done(function (data) {
    // Start fetching the next page of search results before processing this one.
    if (data.continue && data.continue.sroffset > 0) {
      getResults(data.continue.sroffset);
    }

    const hits = (data.query && data.query.search) || [];
    if (hits.length === 0) {
      // Nothing matched on this page, so there is no wikitext to download.
      return;
    }
    const titles = hits.map(function (hit) {
      return hit.title;
    }).join('|');

    api.get({
      "action": "query",
      "format": "json",
      "prop": "revisions",
      "titles": titles,
      "formatversion": "2",
      "rvprop": "content",
      "rvslots": "*"
    }).done(function (pageData) {
      const pages = (pageData.query && pageData.query.pages) || [];
      let links = [];
      // Scan every returned page, including the first one (index 0).
      for (const page of pages) {
        const revision = page.revisions && page.revisions[0];
        const mainSlot = revision && revision.slots && revision.slots.main;
        const content = mainSlot && mainSlot.content;
        if (typeof content !== "string") {
          // Page came back without readable wikitext; skip it instead of throwing.
          continue;
        }
        const matches = content.match(REGEX);
        if (matches) {
          links = links.concat(matches);
        }
      }
      twitterLinks = twitterLinks.concat(links);
      console.log(twitterLinks);
    }).fail(reportFailure);
  }).fail(reportFailure);
}

// Start the crawl at the first page of search results (offset 0).
getResults(0);
	/*

	A console script that can be run on a MediaWiki wiki to extract all tweets linked in the mainspace.
	To run this: visit a MediaWiki wiki, open your browser's developer console, paste this code in, and press enter.
	It will print and re-print a growing array of Tweets until it is done searching.

	How does this janky script work?
	It first searches MediaWiki for all pages that contain the text "twitter".
	Searching for URLs directly (e.g. twitter.com) is unreliable, depending on a wiki's search infrastructure.
	This script then grabs the wikitext from all these pages and uses regex to search for any Tweet URLs.
	An array of these URLs is then printed to the console.
	As each network call is async, the array will be printed multiple times as it is built.

	These Twitter links could then be batch processed on the Wayback Machine;
	one option is using Wayback's Google Sheets batch service.
	https://archive.org/services/wayback-gsheets/

	*/

	// MediaWiki API client (`mw.Api`) used for every request this script makes.
	var api = new mw.Api();
	// Accumulates every tweet URL found so far; it is re-logged as each batch of pages completes.
	var twitterLinks = [];
	// Matches http(s) links to a single tweet: twitter.com/<anything>/status/<digits>,
	// with an optional "www." prefix. The dot in "twitter.com" is escaped so that it
	// only matches a literal "." (an unescaped dot would also accept e.g. "twitterXcom").
	const REGEX = /(https?:\/\/(www\.)?twitter\.com\/\S*?\/status\/\d+)/g;

	/**
	 * Fetches one page of full-text search results for "twitter" starting at
	 * `offset`, requests the next page when the API reports a continuation, then
	 * downloads the wikitext of every hit and appends each URL matching `REGEX`
	 * to the global `twitterLinks` array.
	 *
	 * Requests complete asynchronously, so `twitterLinks` is logged once per
	 * finished batch of pages and grows over time.
	 *
	 * @param {number} offset - Value sent as the search API's `sroffset` parameter.
	 */
	function getResults(offset) {
		// Without a rejection handler a failed request would vanish silently and
		// the output would simply be incomplete.
		const reportFailure = function (code, details) {
			console.error("get-wiki-tweets: API request failed", code, details);
		};

		api.get({
			"action": "query",
			"format": "json",
			"list": "search",
			"utf8": 1,
			"srsearch": "twitter",
			"srlimit": "50",
			"srwhat": "text",
			"srinfo": "",
			"srprop": "",
			"sroffset": offset
		}).done(function (data) {
			// Start fetching the next page of search results before processing this one.
			if (data.continue && data.continue.sroffset > 0) {
				getResults(data.continue.sroffset);
			}

			const hits = (data.query && data.query.search) || [];
			if (hits.length === 0) {
				// Nothing matched on this page, so there is no wikitext to download.
				return;
			}
			const titles = hits.map(function (hit) {
				return hit.title;
			}).join('|');

			api.get({
				"action": "query",
				"format": "json",
				"prop": "revisions",
				"titles": titles,
				"formatversion": "2",
				"rvprop": "content",
				"rvslots": "*"
			}).done(function (pageData) {
				const pages = (pageData.query && pageData.query.pages) || [];
				let links = [];
				// Scan every returned page, including the first one (index 0).
				for (const page of pages) {
					const revision = page.revisions && page.revisions[0];
					const mainSlot = revision && revision.slots && revision.slots.main;
					const content = mainSlot && mainSlot.content;
					if (typeof content !== "string") {
						// Page came back without readable wikitext; skip it instead of throwing.
						continue;
					}
					const matches = content.match(REGEX);
					if (matches) {
						links = links.concat(matches);
					}
				}
				twitterLinks = twitterLinks.concat(links);
				console.log(twitterLinks);
			}).fail(reportFailure);
		}).fail(reportFailure);
	}

	// Start the crawl at the first page of search results (offset 0).
	getResults(0);