Skip to content

Instantly share code, notes, and snippets.

@clinejj
Last active May 12, 2023 14:40
Show Gist options
  • Save clinejj/a5917f55667b3d42e11e7485821c7ac9 to your computer and use it in GitHub Desktop.
Save clinejj/a5917f55667b3d42e11e7485821c7ac9 to your computer and use it in GitHub Desktop.
Google Sheets script to scrape your Gmail
/* Instructions:
* 1. Copy the spreadsheet at https://docs.google.com/spreadsheets/d/1Phwm-gVZ8rbtULjHTmbvcR5lMEdvhD-RJgw2wOeavOk/edit#gid=1172415592
* 2. Copy the script below into the Script Editor (Tools->Script Editor)
* 3. In the Script Editor, go to Run->Run function, and select "parseEmails". This will walk you through granting permissions and then start parsing emails.
* 4. That's it!
*
* Notes:
* There may be some limiting as to how many emails it can parse at a time, so you may need to run multiple times to capture everything.
* The script is setup to be idempotent, so you can continue to just run parseEmails() and it will ignore anything that's already been processed.
* Sheet2 is used to keep track of what's been processed, and Sheet1 lists the count of sent and received emails by address.
*/
Array.prototype.findIndex = function(search){
  if(search == "") return false;
  for (var i = 0; i < this.length; i++)
    if (this[i].toString().toLowerCase().trim() == search) return i;
  return -1;
}
function parseEmails() {
  var data = {};
  var quotes = new RegExp('".*?"', 'gi');
 
  var palace = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet2");
  var processedRows = palace.getLastRow() - 5;
  var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet1");
 
  var start = processedRows > 0 ? processedRows : 0;
  Logger.log("Processing from row " + start);
 
  var threads = GmailApp.getInboxThreads(start, 100);
 
  while (threads.length > 0) {
    Logger.log('Threads count: ' + threads.length);
   
    for (var i = 0; i < threads.length; i++) {
      var thread = threads[i];
      var messages = thread.getMessages();
      var messageIds = '';
     
      var threadInfo = getThreadInfo(thread.getId());
      if (threadInfo.threadExists) {
        if (threadInfo.messageCount == messages.length) {
          Logger.log("Thread: " + thread.getId() + " already processed.");
          continue;
        }
       
        messageIds = threadInfo.messages;
      }
     
     
      Logger.log("Thread: " + thread.getId() + ", messages: " + messages.length);
     
      for (var j = 0; j < messages.length; j++) {
        var message = messages[j];
       
        if (messageIds.indexOf(message.getId()) >= 0) {
          continue;
        } else {
          messageIds = messageIds + message.getId() + ',';
        }
       
        // get email from
        var from = message.getFrom();
        if (from.indexOf('<') >= 0) {
          from = from.split('<')[1].split('>')[0];
        }
        from = from.trim().toLowerCase();
        if (data[from]) {
          data[from]['sends'] = data[from]['sends'] + 1;
        } else {
          data[from] = {};
          data[from]['sends'] = 1;
          data[from]['receives'] = 0;
        }
       
        // get all email to
        var to = message.getTo() + "," + message.getCc() + "," + message.getBcc();
        to = to.replace('undisclosed-recipients:;', '').replace(quotes, '').trim();
        var toSplit = to.split(',');
        for (var index in toSplit) {
          var email = toSplit[index];
          if (email == '' || typeof email === 'function') {
            continue;
          }
          email = email.trim().toLowerCase();
          if (email.indexOf('<') >= 0) {
            email = email.split('<')[1].split('>')[0];
          }
          if (data[email]) {
            data[email]['receives'] = data[email]['receives'] + 1;
          } else {
            data[email] = {};
            data[email]['sends'] = 0;
            data[email]['receives'] = 1;
          }
        }
       
        // Logger.log("Email from " + from + " to " + to);
      }
     
      updateThread(thread.getId(), messages.length, messageIds);
     
      for (var email in data) {
        updateEmail(email, data[email]['sends'], data[email]['receives']);
      }
     
      Logger.log("Updated emails for thread.");
     
      data = {};
    }
   
    start = start + threads.length;
  }
}
function getThreadInfo(threadId) {
  var PALACE_THREAD_COLUMN = 1;
  var PALACE_MESSAGE_COUNT_COLUMN = 2;
  var PALACE_MESSAGES_COLUMN = 3;
  var PALACE_DATA_ROW_START = 2;
 
  var returnValue = {};
  returnValue['threadExists'] = false;
 
  var palace = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet2");
  var threadValues = palace.getRange(PALACE_DATA_ROW_START, PALACE_THREAD_COLUMN, palace.getLastRow()).getValues();
  var threadIndex = threadValues.findIndex(threadId);
 
  if (threadIndex >= 0) {
    var row = PALACE_DATA_ROW_START + threadIndex;
    returnValue['threadExists'] = true;
    returnValue['messageCount'] = parseInt(palace.getRange(row, PALACE_MESSAGE_COUNT_COLUMN).getValue(), 10);
    returnValue['messages'] = palace.getRange(row, PALACE_MESSAGES_COLUMN).getValue();
  }
 
  return returnValue;
}
function updateThread(threadId, messageCount, messages) {
  var PALACE_THREAD_COLUMN = 1;
  var PALACE_MESSAGE_COUNT_COLUMN = 2;
  var PALACE_MESSAGES_COLUMN = 3;
  var PALACE_DATA_ROW_START = 2;
 
  var palace = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet2");
  var columnValues = palace.getRange(PALACE_DATA_ROW_START, PALACE_THREAD_COLUMN, palace.getLastRow()).getValues();
  var searchResult = columnValues.findIndex(threadId);
 
  var row = palace.getLastRow() + 1;
 
  if (searchResult >= 0) {
    row = PALACE_DATA_ROW_START + searchResult;
  } else {
    palace.getRange(row, PALACE_THREAD_COLUMN).setValue(threadId);
  }
 
  palace.getRange(row, PALACE_MESSAGE_COUNT_COLUMN).setValue(messageCount);
  palace.getRange(row, PALACE_MESSAGES_COLUMN).setValue(messages);
 
  Logger.log("Updated thread " + threadId + ": " + messageCount + " messages");
}
function updateEmail(email, sends, receives) {
  var EMAIL_COLUMN = 1;
  var SEND_COLUMN = 2;
  var RECEIVE_COLUMN = 3;
  var DATA_ROW_START = 2;
 
  var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Sheet1");
  var columnValues = sheet.getRange(DATA_ROW_START, EMAIL_COLUMN, sheet.getLastRow()).getValues();
  var searchResult = columnValues.findIndex(email);
  var row = sheet.getLastRow() + 1;
 
  if (searchResult >= 0) {
    // email exists
    row = DATA_ROW_START + searchResult;
  } else {
    // email doesn't exist, append
    sheet.getRange(row, EMAIL_COLUMN).setValue(email);
  }
 
  var originalSends = sheet.getRange(row, SEND_COLUMN).getValue();
  var originalReceives = sheet.getRange(row, RECEIVE_COLUMN).getValue();
  sheet.getRange(row, SEND_COLUMN).setValue(originalSends + sends);
  sheet.getRange(row, RECEIVE_COLUMN).setValue(originalReceives + receives);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment