RobinL/example.py Secret

## example.py
import duckdb

import splink.duckdb.comparison_level_library as cll
import splink.duckdb.comparison_library as cl
from splink.duckdb.linker import DuckDBLinker

# duckdb sql statement that creates a few rows pertaining to a few people
# using UNION ALL


sql = """
    SELECT '1' as unique_id, 'John' as first_name, ['a','b','c'] as test_array
    UNION ALL
    SELECT '2', 'Jane', ['a','e','f']
    UNION ALL
    SELECT '3', 'Jack', ['a','h','i']
    UNION ALL
    SELECT '4', 'Jill', ['a','k','l']
    UNION ALL
    SELECT '5', 'Joe', ['a','n','o']
    """

df_master = duckdb.sql(sql)
print(df_master)


sql = """
SELECT '5' as unique_id, 'John' as first_name, ['a'] as test_array
UNION ALL
SELECT '6', 'John', ['e']
UNION ALL
SELECT '7', 'Jane', ['e']
"""

df_long = duckdb.sql(sql)
print(df_long)


# Derive custom tf table
sql = """
with exploded as (
select unnest(test_array) as test_array
from df_master
)
select [test_array] as test_array, count(*)/(select count(*) from exploded) as tf_test_array
from exploded
group by test_array
"""
__splink__df_tf_test_array = duckdb.sql(sql).df()
__splink__df_tf_test_array


comparison_test_array = {
    "output_column_name": "test_array",
    "comparison_levels": [
        cll.null_level("test_array"),
        {
            "sql_condition": 'array_length(list_intersect("test_array_l", "test_array_r"))>= 1',
            "label_for_charts": "Arrays intersect",
            "tf_adjustment_column": "test_array",  # This controls which lookup to use, it's unrelated to the exact match level
            "tf_adjustment_weight": 1.0,
            "disable_tf_exact_match_detection": True,  # Don't look for an exact match level, use own level instead
        },
        cll.else_level(),
    ],
    "comparison_description": "arr",
}


settings = {
    "probability_two_random_records_match": 0.01,
    "link_type": "link_only",
    "blocking_rules_to_generate_predictions": ["1=1"],  # full cartesian product
    "comparisons": [cl.exact_match("first_name"), comparison_test_array],
    "retain_intermediate_calculation_columns": True,
}

linker = DuckDBLinker(["df_long", "df_master"], settings)
linker.register_term_frequency_lookup(__splink__df_tf_test_array, "test_array")


import logging

logging.getLogger("splink").setLevel(1)
linker.predict().as_pandas_dataframe()
	import duckdb

	import splink.duckdb.comparison_level_library as cll
	import splink.duckdb.comparison_library as cl
	from splink.duckdb.linker import DuckDBLinker

	# duckdb sql statement that creates a few rows pertaining to a few people
	# using UNION ALL


	sql = """
	SELECT '1' as unique_id, 'John' as first_name, ['a','b','c'] as test_array
	UNION ALL
	SELECT '2', 'Jane', ['a','e','f']
	UNION ALL
	SELECT '3', 'Jack', ['a','h','i']
	UNION ALL
	SELECT '4', 'Jill', ['a','k','l']
	UNION ALL
	SELECT '5', 'Joe', ['a','n','o']
	"""

	df_master = duckdb.sql(sql)
	print(df_master)


	sql = """
	SELECT '5' as unique_id, 'John' as first_name, ['a'] as test_array
	UNION ALL
	SELECT '6', 'John', ['e']
	UNION ALL
	SELECT '7', 'Jane', ['e']
	"""

	df_long = duckdb.sql(sql)
	print(df_long)


	# Derive custom tf table
	sql = """
	with exploded as (
	select unnest(test_array) as test_array
	from df_master
	)
	select [test_array] as test_array, count()/(select count() from exploded) as tf_test_array
	from exploded
	group by test_array
	"""
	__splink__df_tf_test_array = duckdb.sql(sql).df()
	__splink__df_tf_test_array


	comparison_test_array = {
	"output_column_name": "test_array",
	"comparison_levels": [
	cll.null_level("test_array"),
	{
	"sql_condition": 'array_length(list_intersect("test_array_l", "test_array_r"))>= 1',
	"label_for_charts": "Arrays intersect",
	"tf_adjustment_column": "test_array", # This controls which lookup to use, it's unrelated to the exact match level
	"tf_adjustment_weight": 1.0,
	"disable_tf_exact_match_detection": True, # Don't look for an exact match level, use own level instead
	},
	cll.else_level(),
	],
	"comparison_description": "arr",
	}


	settings = {
	"probability_two_random_records_match": 0.01,
	"link_type": "link_only",
	"blocking_rules_to_generate_predictions": ["1=1"], # full cartesian product
	"comparisons": [cl.exact_match("first_name"), comparison_test_array],
	"retain_intermediate_calculation_columns": True,
	}

	linker = DuckDBLinker(["df_long", "df_master"], settings)
	linker.register_term_frequency_lookup(__splink__df_tf_test_array, "test_array")


	import logging

	logging.getLogger("splink").setLevel(1)
	linker.predict().as_pandas_dataframe()