Skip to content

Instantly share code, notes, and snippets.

@nntrn
Last active January 11, 2022 08:36
Show Gist options
  • Save nntrn/0ae9208e556a85127d4db6e350a405ed to your computer and use it in GitHub Desktop.
Save nntrn/0ae9208e556a85127d4db6e350a405ed to your computer and use it in GitHub Desktop.
Extended summary for aggregating data @datalib
var dl = require('datalib')
/**
 * Builders that turn a single field profile into a compact summary object.
 *
 * Each `get*` method receives a profile `p` (formatSummaryObj passes the
 * elements of a datalib summary array) and returns a plain object tagged
 * with a `summaryType`.
 */
const getSummary = {
  // Default cap on the number of entries kept in `topValues`.
  maxValuesToInclude: 25,

  // Retained for backward compatibility; none of the methods below read it.
  config: {
    maxValuesToInclude: 25,
    showPercent: true,
  },

  /**
   * List the own enumerable keys of `x`.
   * Inherited properties are deliberately excluded so that a polluted
   * prototype cannot show up as a data value.
   * @param {*} x - object to inspect; null/undefined yield an empty list
   * @returns {string[]}
   */
  keys: function (x) {
    return x == null ? [] : Object.keys(x)
  },

  /**
   * Express `count` as a percentage of `total`, rounded to at most three
   * decimals (trailing zeros vanish because the result is a number).
   * @param {number} count
   * @param {number} total
   * @returns {number}
   */
  percentOf: function (count, total) {
    return Number(((count / total) * 100).toFixed(3))
  },

  /**
   * Summary for fields whose values are (almost) all different.
   * @param {{distinct: number, valid: number}} p - field profile
   * @returns {{summaryType: string, distinct: number, valid: number}}
   */
  getDistinct: function (p) {
    return {
      summaryType: 'distinct',
      distinct: p.distinct,
      valid: p.valid,
    }
  },

  /**
   * Summary for numeric fields.
   * @param {object} p - field profile exposing distinct, valid, missing,
   *   min, max, median, mean and stdev
   * @param {object} [options]
   * @param {function} [options.fmt] - applied to every statistic before it
   *   is stored; defaults to the identity function
   * @returns {object}
   */
  getQuantitative: function (p, options) {
    const config = {
      fmt: function (n) { return n },
      ...options
    }
    const fmtFn = config.fmt
    return {
      summaryType: 'quantitative',
      distinct: fmtFn(p.distinct),
      valid: fmtFn(p.valid),
      missing: fmtFn(p.missing),
      min: fmtFn(p.min),
      max: fmtFn(p.max),
      median: fmtFn(p.median),
      mean: fmtFn(p.mean),
      stdev: fmtFn(p.stdev),
    }
  },

  /**
   * Summary for categorical fields: the values ordered by descending count.
   *
   * When there are more values than `maxValuesToInclude`, the tail is
   * folded into a single trailing `{ value: 'other' }` entry. The tail
   * starts at the first value that occurs exactly once, or at
   * `maxValuesToInclude`, whichever comes first, so the listed values never
   * exceed the cap and no value is both listed and counted in `other`.
   *
   * @param {object} p - field profile; `p.unique` is read as a
   *   value -> occurrence-count map and `p.count` as the percentage base
   * @param {object} [options]
   * @param {number} [options.maxValuesToInclude] - defaults to
   *   `this.maxValuesToInclude`
   * @param {boolean} [options.showPercent=true] - add a '%' key to every
   *   entry, including the `other` bucket
   * @returns {{summaryType: string, valid: number, distinct: number, topValues: object[]}}
   */
  getCategorical: function (p, options) {
    const config = {
      maxValuesToInclude: this.maxValuesToInclude,
      showPercent: true,
      ...options
    }
    const u = p.unique

    // A single (stable) sort by descending count is enough.
    const top = this.keys(u)
      .sort((a, b) => u[b] - u[a])
      .map((v) => {
        // The empty-string key gets a readable placeholder.
        const item = { value: v || '(blank)', count: u[v] }
        if (config.showPercent) {
          item['%'] = this.percentOf(u[v], p.count)
        }
        return item
      })

    if (top.length > config.maxValuesToInclude) {
      const firstSingleton = top.findIndex(e => e.count === 1)
      const cutoff = firstSingleton > -1
        ? Math.min(firstSingleton, config.maxValuesToInclude)
        : config.maxValuesToInclude

      // splice (not slice) so the lumped entries really leave `top`.
      const others = top.splice(cutoff)
      const otherCount = others.reduce((sum, e) => sum + e.count, 0)

      const otherItem = { value: 'other', count: otherCount }
      if (config.showPercent) {
        otherItem['%'] = this.percentOf(otherCount, p.count)
      }
      top.push(otherItem)
    }

    return {
      summaryType: 'categorical',
      valid: p.valid,
      distinct: p.distinct,
      topValues: top
    }
  }
}
/**
 * Build one summary row per field.
 *
 * @param {*} s - an object carrying a truthy `__summary__` flag (used as is)
 *   or anything else truthy (handed to `dl.summary` first); when falsy,
 *   `this` is iterated instead
 * @param {object} [options] - forwarded untouched to the chosen
 *   `getSummary` builder
 * @returns {object[]} rows of `{ name, dataType, ...builderOutput }`
 */
function formatSummaryObj(s, options) {
  let profiles
  if (!s) {
    profiles = this
  } else if (s.__summary__) {
    profiles = s
  } else {
    profiles = dl.summary(s)
  }

  const rows = []
  profiles.forEach((profile) => {
    // Fields whose values are all, or more than 85%, distinct get the
    // short "distinct" summary; remaining 'number' fields get statistics;
    // everything else is treated as categorical.
    let builder
    if (profile.valid === profile.distinct) {
      builder = 'getDistinct'
    } else if (profile.distinct / profile.valid > 0.85) {
      builder = 'getDistinct'
    } else if (profile.type === 'number') {
      builder = 'getQuantitative'
    } else {
      builder = 'getCategorical'
    }

    rows.push({
      name: profile.field,
      dataType: profile.type,
      ...getSummary[builder](profile, options)
    })
  })
  return rows
}
module.exports = formatSummaryObj

Usage

// Example: load a CSV through datalib, summarise it and save the result
// as pretty-printed JSON.
const dl = require('datalib')
const fs = require('fs')
const formatSummaryObj = require('./datalib-extended-summary.js')

const rows = dl.csv('https://raw.githubusercontent.com/hadley/babynames/master/data-raw/lifetables_sample.csv')
const report = JSON.stringify(formatSummaryObj(rows), null, 2)

fs.writeFileSync('./summary.json', report)

Result

[
  {
    "name": "x",
    "dataType": "integer",
    "summaryType": "categorical",
    "valid": 28,
    "distinct": 6,
    "topValues": [
      {
        "value": "39",
        "count": 5,
        "%": 17.857
      },
      {
        "value": "59",
        "count": 5,
        "%": 17.857
      },
      {
        "value": "79",
        "count": 5,
        "%": 17.857
      },
      {
        "value": "99",
        "count": 5,
        "%": 17.857
      },
      {
        "value": "19",
        "count": 4,
        "%": 14.286
      },
      {
        "value": "119",
        "count": 4,
        "%": 14.286
      }
    ]
  },
  {
    "name": "qx",
    "dataType": "number",
    "summaryType": "distinct",
    "distinct": 28,
    "valid": 28
  },
  {
    "name": "lx",
    "dataType": "integer",
    "summaryType": "distinct",
    "distinct": 26,
    "valid": 28
  },
  {
    "name": "dx",
    "dataType": "integer",
    "summaryType": "distinct",
    "distinct": 25,
    "valid": 28
  },
  {
    "name": "Lx",
    "dataType": "integer",
    "summaryType": "distinct",
    "distinct": 26,
    "valid": 28
  },
  {
    "name": "Tx",
    "dataType": "integer",
    "summaryType": "distinct",
    "distinct": 26,
    "valid": 28
  },
  {
    "name": "ex",
    "dataType": "number",
    "summaryType": "distinct",
    "distinct": 28,
    "valid": 28
  },
  {
    "name": "sex",
    "dataType": "string",
    "summaryType": "categorical",
    "valid": 28,
    "distinct": 2,
    "topValues": [
      {
        "value": "M",
        "count": 14,
        "%": 50
      },
      {
        "value": "F",
        "count": 14,
        "%": 50
      }
    ]
  },
  {
    "name": "year",
    "dataType": "integer",
    "summaryType": "categorical",
    "valid": 28,
    "distinct": 12,
    "topValues": [
      {
        "value": "1920",
        "count": 3,
        "%": 10.714
      },
      {
        "value": "1940",
        "count": 3,
        "%": 10.714
      },
      {
        "value": "1970",
        "count": 3,
        "%": 10.714
      },
      {
        "value": "1990",
        "count": 3,
        "%": 10.714
      },
      {
        "value": "1900",
        "count": 2,
        "%": 7.143
      },
      {
        "value": "1910",
        "count": 2,
        "%": 7.143
      },
      {
        "value": "1930",
        "count": 2,
        "%": 7.143
      },
      {
        "value": "1950",
        "count": 2,
        "%": 7.143
      },
      {
        "value": "1960",
        "count": 2,
        "%": 7.143
      },
      {
        "value": "1980",
        "count": 2,
        "%": 7.143
      },
      {
        "value": "2000",
        "count": 2,
        "%": 7.143
      },
      {
        "value": "2010",
        "count": 2,
        "%": 7.143
      }
    ]
  }
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment