Last active
May 12, 2023 14:40
-
-
Save clinejj/a5917f55667b3d42e11e7485821c7ac9 to your computer and use it in GitHub Desktop.
Google Sheets script to scrape your Gmail
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Instructions: | |
* 1. Copy the spreadsheet at https://docs.google.com/spreadsheets/d/1Phwm-gVZ8rbtULjHTmbvcR5lMEdvhD-RJgw2wOeavOk/edit#gid=1172415592 | |
* 2. Copy the script below into the Script Editor (Tools->Script Editor) | |
* 3. In the Script Editor, go to Run->Run function, and select "parseEmails". This will walk you through granting permissions and then start parsing emails. | |
* 4. That's it! | |
* | |
* Notes: | |
* There may be some limiting as to how many emails it can parse at a time, so you may need to run multiple times to capture everything. | |
* The script is setup to be idempotent, so you can continue to just run parseEmails() and it will ignore anything that's already been processed. | |
* Sheet2 is used to keep track of what's been processed, and Sheet1 lists the count of sent and received emails by address. | |
*/ | |
Array.prototype.findIndex = function(search){ | |
if(search == "") return false; | |
for (var i = 0; i < this.length; i++) | |
if (this[i].toString().toLowerCase().trim() == search) return i; | |
return -1; | |
} | |
function parseEmails() { | |
var data = {}; | |
var quotes = new RegExp('".*?"', 'gi'); | |
var palace = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet2"); | |
var processedRows = palace.getLastRow() - 5; | |
var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet1"); | |
var start = processedRows > 0 ? processedRows : 0; | |
Logger.log("Processing from row " + start); | |
var threads = GmailApp.getInboxThreads(start, 100); | |
while (threads.length > 0) { | |
Logger.log('Threads count: ' + threads.length); | |
for (var i = 0; i < threads.length; i++) { | |
var thread = threads[i]; | |
var messages = thread.getMessages(); | |
var messageIds = ''; | |
var threadInfo = getThreadInfo(thread.getId()); | |
if (threadInfo.threadExists) { | |
if (threadInfo.messageCount == messages.length) { | |
Logger.log("Thread: " + thread.getId() + " already processed."); | |
continue; | |
} | |
messageIds = threadInfo.messages; | |
} | |
Logger.log("Thread: " + thread.getId() + ", messages: " + messages.length); | |
for (var j = 0; j < messages.length; j++) { | |
var message = messages[j]; | |
if (messageIds.indexOf(message.getId()) >= 0) { | |
continue; | |
} else { | |
messageIds = messageIds + message.getId() + ','; | |
} | |
// get email from | |
var from = message.getFrom(); | |
if (from.indexOf('<') >= 0) { | |
from = from.split('<')[1].split('>')[0]; | |
} | |
from = from.trim().toLowerCase(); | |
if (data[from]) { | |
data[from]['sends'] = data[from]['sends'] + 1; | |
} else { | |
data[from] = {}; | |
data[from]['sends'] = 1; | |
data[from]['receives'] = 0; | |
} | |
// get all email to | |
var to = message.getTo() + "," + message.getCc() + "," + message.getBcc(); | |
to = to.replace('undisclosed-recipients:;', '').replace(quotes, '').trim(); | |
var toSplit = to.split(','); | |
for (var index in toSplit) { | |
var email = toSplit[index]; | |
if (email == '' || typeof email === 'function') { | |
continue; | |
} | |
email = email.trim().toLowerCase(); | |
if (email.indexOf('<') >= 0) { | |
email = email.split('<')[1].split('>')[0]; | |
} | |
if (data[email]) { | |
data[email]['receives'] = data[email]['receives'] + 1; | |
} else { | |
data[email] = {}; | |
data[email]['sends'] = 0; | |
data[email]['receives'] = 1; | |
} | |
} | |
// Logger.log("Email from " + from + " to " + to); | |
} | |
updateThread(thread.getId(), messages.length, messageIds); | |
for (var email in data) { | |
updateEmail(email, data[email]['sends'], data[email]['receives']); | |
} | |
Logger.log("Updated emails for thread."); | |
data = {}; | |
} | |
start = start + threads.length; | |
} | |
} | |
function getThreadInfo(threadId) { | |
var PALACE_THREAD_COLUMN = 1; | |
var PALACE_MESSAGE_COUNT_COLUMN = 2; | |
var PALACE_MESSAGES_COLUMN = 3; | |
var PALACE_DATA_ROW_START = 2; | |
var returnValue = {}; | |
returnValue['threadExists'] = false; | |
var palace = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet2"); | |
var threadValues = palace.getRange(PALACE_DATA_ROW_START, PALACE_THREAD_COLUMN, palace.getLastRow()).getValues(); | |
var threadIndex = threadValues.findIndex(threadId); | |
if (threadIndex >= 0) { | |
var row = PALACE_DATA_ROW_START + threadIndex; | |
returnValue['threadExists'] = true; | |
returnValue['messageCount'] = parseInt(palace.getRange(row, PALACE_MESSAGE_COUNT_COLUMN).getValue(), 10); | |
returnValue['messages'] = palace.getRange(row, PALACE_MESSAGES_COLUMN).getValue(); | |
} | |
return returnValue; | |
} | |
function updateThread(threadId, messageCount, messages) { | |
var PALACE_THREAD_COLUMN = 1; | |
var PALACE_MESSAGE_COUNT_COLUMN = 2; | |
var PALACE_MESSAGES_COLUMN = 3; | |
var PALACE_DATA_ROW_START = 2; | |
var palace = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet2"); | |
var columnValues = palace.getRange(PALACE_DATA_ROW_START, PALACE_THREAD_COLUMN, palace.getLastRow()).getValues(); | |
var searchResult = columnValues.findIndex(threadId); | |
var row = palace.getLastRow() + 1; | |
if (searchResult >= 0) { | |
row = PALACE_DATA_ROW_START + searchResult; | |
} else { | |
palace.getRange(row, PALACE_THREAD_COLUMN).setValue(threadId); | |
} | |
palace.getRange(row, PALACE_MESSAGE_COUNT_COLUMN).setValue(messageCount); | |
palace.getRange(row, PALACE_MESSAGES_COLUMN).setValue(messages); | |
Logger.log("Updated thread " + threadId + ": " + messageCount + " messages"); | |
} | |
function updateEmail(email, sends, receives) { | |
var EMAIL_COLUMN = 1; | |
var SEND_COLUMN = 2; | |
var RECEIVE_COLUMN = 3; | |
var DATA_ROW_START = 2; | |
var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet1"); | |
var columnValues = sheet.getRange(DATA_ROW_START, EMAIL_COLUMN, sheet.getLastRow()).getValues(); | |
var searchResult = columnValues.findIndex(email); | |
var row = sheet.getLastRow() + 1; | |
if (searchResult >= 0) { | |
// email exists | |
row = DATA_ROW_START + searchResult; | |
} else { | |
// email doesn't exist, append | |
sheet.getRange(row, EMAIL_COLUMN).setValue(email); | |
} | |
var originalSends = sheet.getRange(row, SEND_COLUMN).getValue(); | |
var originalReceives = sheet.getRange(row, RECEIVE_COLUMN).getValue(); | |
sheet.getRange(row, SEND_COLUMN).setValue(originalSends + sends); | |
sheet.getRange(row, RECEIVE_COLUMN).setValue(originalReceives + receives); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment