Simple automated data cleaning in BigQuery

WITH caps_ as (
SELECT
approx_quantiles(m, 100)[offset(99)] as max_cap,
approx_quantiles(m, 100)[offset(1)] as min_cap
FROM my_table
)
SELECT *,
greatest((select min_cap from caps_),
least((select max_cap from caps_), m)) as m_capped
FROM my_table
CALL gislib.sp.CreateCappedView(
'my_project.views.table_capped', -- view name to create
'my_project.dataset.table', -- base table name
['amount', 'coolness', 'value'] -- list of measures
);
CALL gislib.sp.CreateAutoCappedView(
'my_project.views.table_capped', -- view name to create
'my_project.dataset.table', -- base table name
);

Internals

CREATE OR REPLACE FUNCTION gislib.sp.RepeatPattern(
pattern STRING, x STRING, items ARRAY<STRING>)
AS ((
SELECT STRING_AGG(REPLACE(pattern, x, i), ', \n')
FROM UNNEST(items) i
));
CREATE OR REPLACE PROCEDURE `gislib.sp.CreateCappedView`(
view_name STRING,
table_name STRING,
measures ARRAY<STRING>)
BEGIN
EXECUTE IMMEDIATE
'CREATE OR REPLACE VIEW `' || view_name || '` AS \n'
|| 'WITH caps_ as (SELECT \n'
|| gislib.sp.RepeatPattern('approx_quantiles($, 100)[offset(99)] as $_max_cap_', '$', measures)
|| ', \n'
|| gislib.sp.RepeatPattern('approx_quantiles($, 100)[offset(1)] as $_min_cap_', '$', measures)
|| ' FROM `' || table_name || '` )\n'
|| ' SELECT *, \n'
|| gislib.sp.RepeatPattern('greatest((select $_min_cap_ from caps_), least((select $_max_cap_ from caps_), $)) as $_capped', '$', measures)
|| ' FROM `' || table_name || '`';
END;
CREATE OR REPLACE PROCEDURE gislib.sp.CreateAutoCappedView(view STRING, schema_table STRING)
BEGIN
DECLARE project STRING DEFAULT SPLIT(schema_table, '.')[offset(0)];
DECLARE schema STRING DEFAULT SPLIT(schema_table, '.')[offset(1)];
DECLARE table STRING DEFAULT SPLIT(schema_table, '.')[offset(2)];
DECLARE sql STRING DEFAULT
"SELECT ARRAY( SELECT column_name FROM `" || project || '.' || schema || "`.INFORMATION_SCHEMA.COLUMNS " ||
"WHERE table_name = '" || table || "' AND data_type IN ('FLOAT64', 'NUMERIC', 'INT64') );";
DECLARE columns ARRAY<STRING>;
EXECUTE IMMEDIATE sql INTO columns;

CALL gislib.sp.CreateCappedView(view, schema_table, columns);
END;

--

--

--

Hi, I'm TL of BigQuery Geospatial project. Posting small recipes and various notes for BQ Geospatial users.

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

Living Life One Roll At A Time

Examining Dog Registrations in NYC

What does it mean to be data driven?

Cape Python: Apply Privacy-Enhancing Techniques to Protect Sensitive Data in Pandas and Spark

Organizing Your First “Text Analytics” Project

Life is a journey of twists and turns, peaks and valleys, mountains to climb and oceans to explore.

What 10,000 tweets tell you about the French election

How to Pick the Right Notebook for Data Science

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Michael Entin

Michael Entin

Hi, I'm TL of BigQuery Geospatial project. Posting small recipes and various notes for BQ Geospatial users.

More from Medium

Materialized views in BigQuery

An introduction to Dynamic SQL in BigQuery

The easiest way to load a CSV into Google BigQuery

How to Pivot in Google BigQuery