Simple automated data cleaning in BigQuery

WITH caps_ as (
SELECT
approx_quantiles(m, 100)[offset(99)] as max_cap,
approx_quantiles(m, 100)[offset(1)] as min_cap
FROM my_table
)
SELECT *,
greatest((select min_cap from caps_),
least((select max_cap from caps_), m)) as m_capped
FROM my_table
CALL gislib.sp.CreateCappedView(
'my_project.views.table_capped', -- view name to create
'my_project.dataset.table', -- base table name
['amount', 'coolness', 'value'] -- list of measures
);
CALL gislib.sp.CreateAutoCappedView(
'my_project.views.table_capped', -- view name to create
'my_project.dataset.table', -- base table name
);

Internals

Here is the script to create the text of our first stored procedure:

CREATE OR REPLACE FUNCTION gislib.sp.RepeatPattern(
pattern STRING, x STRING, items ARRAY<STRING>)
AS ((
SELECT STRING_AGG(REPLACE(pattern, x, i), ', \n')
FROM UNNEST(items) i
));
CREATE OR REPLACE PROCEDURE `gislib.sp.CreateCappedView`(
view_name STRING,
table_name STRING,
measures ARRAY<STRING>)
BEGIN
EXECUTE IMMEDIATE
'CREATE OR REPLACE VIEW `' || view_name || '` AS \n'
|| 'WITH caps_ as (SELECT \n'
|| gislib.sp.RepeatPattern('approx_quantiles($, 100)[offset(99)] as $_max_cap_', '$', measures)
|| ', \n'
|| gislib.sp.RepeatPattern('approx_quantiles($, 100)[offset(1)] as $_min_cap_', '$', measures)
|| ' FROM `' || table_name || '` )\n'
|| ' SELECT *, \n'
|| gislib.sp.RepeatPattern('greatest((select $_min_cap_ from caps_), least((select $_max_cap_ from caps_), $)) as $_capped', '$', measures)
|| ' FROM `' || table_name || '`';
END;
CREATE OR REPLACE PROCEDURE gislib.sp.CreateAutoCappedView(view STRING, schema_table STRING)
BEGIN
DECLARE project STRING DEFAULT SPLIT(schema_table, '.')[offset(0)];
DECLARE schema STRING DEFAULT SPLIT(schema_table, '.')[offset(1)];
DECLARE table STRING DEFAULT SPLIT(schema_table, '.')[offset(2)];
DECLARE sql STRING DEFAULT
"SELECT ARRAY( SELECT column_name FROM `" || project || '.' || schema || "`.INFORMATION_SCHEMA.COLUMNS " ||
"WHERE table_name = '" || table || "' AND data_type IN ('FLOAT64', 'NUMERIC', 'INT64') );";
DECLARE columns ARRAY<STRING>;
EXECUTE IMMEDIATE sql INTO columns;

CALL gislib.sp.CreateCappedView(view, schema_table, columns);
END;

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Michael Entin

Michael Entin

151 Followers

Hi, I'm TL of BigQuery Geospatial project. Posting small recipes and various notes for BQ Geospatial users.