Simple automated data cleaning in BigQuery

WITH caps_ as (
SELECT
approx_quantiles(m, 100)[offset(99)] as max_cap,
approx_quantiles(m, 100)[offset(1)] as min_cap
FROM my_table
)
SELECT *,
greatest((select min_cap from caps_),
least((select max_cap from caps_), m)) as m_capped
FROM my_table
CALL gislib.sp.CreateCappedView(
'my_project.views.table_capped', -- view name to create
'my_project.dataset.table', -- base table name
['amount', 'coolness', 'value'] -- list of measures
);
CALL gislib.sp.CreateAutoCappedView(
'my_project.views.table_capped', -- view name to create
'my_project.dataset.table', -- base table name
);

Internals

CREATE OR REPLACE FUNCTION gislib.sp.RepeatPattern(
pattern STRING, x STRING, items ARRAY<STRING>)
AS ((
SELECT STRING_AGG(REPLACE(pattern, x, i), ', \n')
FROM UNNEST(items) i
));
CREATE OR REPLACE PROCEDURE `gislib.sp.CreateCappedView`(
view_name STRING,
table_name STRING,
measures ARRAY<STRING>)
BEGIN
EXECUTE IMMEDIATE
'CREATE OR REPLACE VIEW `' || view_name || '` AS \n'
|| 'WITH caps_ as (SELECT \n'
|| gislib.sp.RepeatPattern('approx_quantiles($, 100)[offset(99)] as $_max_cap_', '$', measures)
|| ', \n'
|| gislib.sp.RepeatPattern('approx_quantiles($, 100)[offset(1)] as $_min_cap_', '$', measures)
|| ' FROM `' || table_name || '` )\n'
|| ' SELECT *, \n'
|| gislib.sp.RepeatPattern('greatest((select $_min_cap_ from caps_), least((select $_max_cap_ from caps_), $)) as $_capped', '$', measures)
|| ' FROM `' || table_name || '`';
END;
CREATE OR REPLACE PROCEDURE gislib.sp.CreateAutoCappedView(view STRING, schema_table STRING)
BEGIN
DECLARE project STRING DEFAULT SPLIT(schema_table, '.')[offset(0)];
DECLARE schema STRING DEFAULT SPLIT(schema_table, '.')[offset(1)];
DECLARE table STRING DEFAULT SPLIT(schema_table, '.')[offset(2)];
DECLARE sql STRING DEFAULT
"SELECT ARRAY( SELECT column_name FROM `" || project || '.' || schema || "`.INFORMATION_SCHEMA.COLUMNS " ||
"WHERE table_name = '" || table || "' AND data_type IN ('FLOAT64', 'NUMERIC', 'INT64') );";
DECLARE columns ARRAY<STRING>;
EXECUTE IMMEDIATE sql INTO columns;

CALL gislib.sp.CreateCappedView(view, schema_table, columns);
END;

--

--

--

Hi, I'm TL of BigQuery Geospatial project. Posting small recipes and various notes for BQ Geospatial users.

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

DLL Injection & DLLInjector

Apache Hop, environments and configurations. A better setup.

nCine Dev Update 12

Network Automation with Python

From local code to public container registry

WSL enable systemd (systemctl command)

Introduction to Entity storage for Games using ECS (or DCS)

Reaching Points

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Michael Entin

Michael Entin

Hi, I'm TL of BigQuery Geospatial project. Posting small recipes and various notes for BQ Geospatial users.

More from Medium

BigQuery: Importing your data with SQL from Google cloud and drive storage

Evaluate arithmetic expressions without values using BigQuery

Optimize costs in BigQuery — 9 solutions

The easiest way to load a CSV into Google BigQuery