Here is the record layout of the table (load_history) I am trying to use filter on using Standard Sql (because legacy sql will probably become obsolete at some point):
[
{
"mode": "NULLABLE",
"name": "Job",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "name",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "start_time",
"type": "TIMESTAMP"
},
{
"mode": "NULLABLE",
"name": "end_time",
"type": "TIMESTAMP"
},
{
]
},
{
"mode": "REPEATED",
"name": "source",
"type": "RECORD",
"description": "source tables touched by this job",
"fields": [
{
"mode": "NULLABLE",
"name": "database",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "schema",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "table",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "partition_time",
"type": "TIMESTAMP"
}
]
}
]
I need to filter and select only records where there is an entry in array "source" whose "schema" & "table" fields match certain values (ex. schema='log' AND table='customer' in the same array entry).
The following works when filtering only on one field in the Struct (schema name) :
select name, array(select x from unnest(schema) as x where x ='log' ), table
from (select job.name , array(select schema from unnest(source)) as schema,
array(select table from unnest(source)) as table
from config.load_history)
However, I can't make it work to filter on combination of fields in the same array entry.
Would appreciate your help
for BigQuery Standard SQL
#standardSQL
SELECT data
FROM data, UNNEST(source) AS s
WHERE (s.schema, s.table) = ('log', 'customer')
or
#standardSQL
SELECT *
FROM data
WHERE EXISTS (
SELECT 1 FROM UNNEST(source) AS s
WHERE (s.schema, s.table) = ('log', 'customer')
)
you can test/play with it with below dummy data
#standardSQL
WITH data AS (
SELECT
STRUCT<name STRING, start_time INT64, end_time INT64>('jobA', 1, 2) AS job,
[STRUCT<database STRING, schema STRING, table STRING, partition_time INT64>
('d1', 's1', 't1', 1),
('d1', 's2', 't2', 2),
('d1', 's3', 't3', 3)
] AS source UNION ALL
SELECT
STRUCT<name STRING, start_time INT64, end_time INT64>('jobB', 1, 2) AS job,
[STRUCT<database STRING, schema STRING, table STRING, partition_time INT64>
('d1', 's1', 't1', 1),
('d2', 's4', 't2', 2),
('d2', 's3', 't3', 3)
] AS source
)
SELECT *
FROM data
WHERE EXISTS (
SELECT 1 FROM UNNEST(source) AS s
WHERE (s.schema, s.table) = ('s2', 't2')
)
It sounds like you want something like this:
SELECT
job.name,
ARRAY(SELECT schema FROM UNNEST(matching_sources)) AS matching_schemas,
ARRAY(SELECT table FROM UNNEST(matching_sources)) AS matching_tables
FROM (
SELECT *,
ARRAY(SELECT AS STRUCT * FROM UNNEST(sources)
WHERE schema = 'log' AND `table` = 'customer') AS matching_sources
FROM YourTable
)
WHERE ARRAY_LENGTH(matching_sources) > 0;
This would return an array of schemas and an array of tables, where both match the condition, and exclude rows where no entry in the array matches the condition.
I need to filter and select only records where there is an entry in array "source" whose "schema" & "table" fields match certain values
This sounds like it could be solved with a simple WHERE
clause, like so:
WITH data AS(
select STRUCT<name STRING, start_time TIMESTAMP, end_time TIMESTAMP> ('job_1', TIMESTAMP("2017-06-10"), TIMESTAMP("2017-06-11")) Job, ARRAY<STRUCT<database STRING, schema STRING, table STRING, partition_time TIMESTAMP> > [STRUCT('database_1', "schema_1", "table_1", TIMESTAMP("2017-06-10")), STRUCT('database_1', "schema_1", "table_2", TIMESTAMP("2017-06-10")), STRUCT('database_1', "schema_3", "table_1", TIMESTAMP("2017-06-10")), STRUCT('database_2', "schema_2", "table_2", TIMESTAMP("2017-06-10"))] source union all
select STRUCT<name STRING, start_time TIMESTAMP, end_time TIMESTAMP> ('job_2', TIMESTAMP("2017-06-10"), TIMESTAMP("2017-06-11")) Job, ARRAY<STRUCT<database STRING, schema STRING, table STRING, partition_time TIMESTAMP> > [STRUCT('database_2', "schema_2", "table_2", TIMESTAMP("2017-06-10")), STRUCT('database_2', "schema_2", "table_3", TIMESTAMP("2017-06-10")), STRUCT('database_1', "schema_1", "table_3", TIMESTAMP("2017-06-10"))] source
)
SELECT
*
FROM data
WHERE EXISTS(SELECT 1 FROM UNNEST(source) WHERE schema = "schema_2" AND table = "table_2")
This will return all rows where, at some point, had a given schema and a given table.
If you want to filter out in the output only the records with the matched filters, you could also run this:
SELECT
job.*,
ARRAY(SELECT AS STRUCT database, schema, table, partition_time FROM UNNEST(source) WHERE schema = "schema_2" AND table = "table_2") filtered_data
FROM data
WHERE EXISTS(SELECT 1 FROM UNNEST(source) WHERE schema = "schema_2" AND table = "table_2")
Not sure if this is exactly what you want in your question but it might give you an idea on how to filter out values from ARRAYs.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With