Skip to content

Instantly share code, notes, and snippets.

# Proof-of-concept implementation of probabilistic counting
# Fix the RNG seed so that repeated runs are reproducible.
set.seed(123456789)
# Length of the bit vector allocated inside probabilisticCounting().
n <- 16
probabilisticCounting = function(stream) {
bits <- rep(0, n)
for (s in stream) {
bits[p(binary(hash(s)))] <- 1
# Plot the VIX adjusted close next to the MD5 hash of each observation
# (hex digest read as a number and scaled), in two side-by-side panels.
library(quantmod)
library(digest)  # digest() is what produces the MD5 hashes below

# getSymbols() drops the leading caret, so "^VIX" is assigned to `VIX`;
# no AAPL series is ever loaded by this script.
getSymbols("^VIX")
vix <- VIX[, "VIX.Adjusted"]

# Hash every observation of the column; vapply() pins the result type to one
# character digest per value.
vix.hash <- apply(vix, 2, function(col) {
  vapply(col, digest, character(1), algo = "md5")
})

# Interpret each hex digest as a number and scale it down.
# NOTE(review): an MD5 digest is 128 bits, i.e. below 2^128 (~3.4e+38), so
# dividing by 15e+38 keeps values under ~0.23 on the 0-1 axis used below --
# confirm that this divisor is intended.
vix.hash <- apply(vix.hash, 2, function(col) {
  as.numeric(paste0("0x", col)) / 15e+38
})

# Two panels on one row; restore the previous graphics settings afterwards.
old.par <- par(mfrow = c(1, 2))
plot(vix, ylim = c(10, 120), ylab = "")
plot(vix.hash, ylim = c(0, 1), ylab = "")
par(old.par)
@dataminelab
dataminelab / unique_visitors.mondrian.xml
Created August 20, 2011 16:17
Unique visitors cube
<Schema name="UNIQUE_VISITORS">
<Cube name="UNIQUE_VISITORS" cache="true" enabled="true">
<Table name="SAMPLE_UNIQUE_VISITORS">
</Table>
<Dimension foreignKey="day" name="Date">
<Hierarchy hasAll="true" primaryKey="day_date">
<Table name="DIM_DATE">
</Table>
<Level name="Year" column="year_number" type="Numeric" uniqueMembers="false" levelType="TimeYears" hideMemberIf="Never">
@dataminelab
dataminelab / select_distinct.sql
Created August 20, 2011 15:36
Selecting unique visitors in Hive
-- Number of distinct user_id values present in page_views.
SELECT
  COUNT(DISTINCT user_id)
FROM
  page_views
@dataminelab
dataminelab / hive_md5.java
Created June 27, 2011 22:24
Hive MD5 UDF
package com.dataminelab.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import java.security.*;
/**
* Calculates the MD5 hash of the input string.
*/
@dataminelab
dataminelab / unique_visitors.ql
Created June 27, 2011 21:42
Unique visitors HiveQL
-- Register the jar that holds the custom UDF and expose it as mymd5.
ADD JAR ${CODE}/hive/udf/md5.jar;
CREATE TEMPORARY FUNCTION mymd5 AS 'com.dataminelab.hive.udf.Md5';

-- Distinct mymd5(user_id) values whose last two characters are '00'.
-- NOTE(review): presumably a ~1/256 sample of users if mymd5 returns a hex
-- digest -- confirm against the UDF.
SELECT DISTINCT
  mymd5(user_id)
FROM
  page_views
WHERE
  SUBSTR(mymd5(user_id), -2) = '00';