From ed5cdeb1f7897dea857416869b78457ecffbb0c4 Mon Sep 17 00:00:00 2001 From: Lak Lakshmanan Date: Tue, 20 Aug 2019 10:24:18 -0700 Subject: [PATCH] script to extract code from Google Docs. --- 01_intro/all_code.txt | 6 + 02_query/all_code.txt | 302 ++++++++++ 03_func/all_code.txt | 298 ++++++++++ 04_load/all_code.txt | 366 ++++++++++++ 05_devel/all_code.txt | 684 +++++++++++++++++++++++ 06_arch/all_code.txt | 90 +++ 07_perf/all_code.txt | 877 +++++++++++++++++++++++++++++ 08_advqueries/all_code.txt | 1071 ++++++++++++++++++++++++++++++++++++ 09_bqml/all_code.txt | 794 ++++++++++++++++++++++++++ 10_securing/all_code.txt | 260 +++++++++ scripts/.gitignore | 2 + scripts/README.md | 2 + scripts/extract_all.sh | 12 + scripts/extract_code.py | 106 ++++ scripts/install.sh | 1 + 15 files changed, 4871 insertions(+) create mode 100644 01_intro/all_code.txt create mode 100644 02_query/all_code.txt create mode 100644 03_func/all_code.txt create mode 100644 04_load/all_code.txt create mode 100644 05_devel/all_code.txt create mode 100644 06_arch/all_code.txt create mode 100644 07_perf/all_code.txt create mode 100644 08_advqueries/all_code.txt create mode 100644 09_bqml/all_code.txt create mode 100644 10_securing/all_code.txt create mode 100644 scripts/.gitignore create mode 100644 scripts/README.md create mode 100755 scripts/extract_all.sh create mode 100755 scripts/extract_code.py create mode 100755 scripts/install.sh diff --git a/01_intro/all_code.txt b/01_intro/all_code.txt new file mode 100644 index 0000000..6158f16 --- /dev/null +++ b/01_intro/all_code.txt @@ -0,0 +1,6 @@ +SELECT EXTRACT(YEAR FROM starttime) AS year, EXTRACT(MONTH FROM starttime) AS month, COUNT(starttime) AS number_one_way FROM mydb.return_transactions WHERE start_station_name != end_station_name GROUP BY year, month ORDER BY year ASC, month ASC + +station_name +SELECT EXTRACT(YEAR FROM starttime) AS year, EXTRACT(MONTH FROM starttime) AS month, COUNT(starttime) AS number_one_way FROM 
`bigquery-public-data.new_york_citibike.citibike_trips` WHERE start_station_name != end_station_name GROUP BY year, month ORDER BY year ASC, month ASC-- Are there fewer bicycle rentals on rainy days? +WITH bicycle_rentals AS ( SELECT COUNT(starttime) as num_trips, EXTRACT(DATE from starttime) as trip_date FROM `bigquery-public-data.new_york_citibike.citibike_trips` GROUP BY trip_date ), rainy_days AS ( SELECT date, (MAX(prcp) > 5) AS rainy FROM ( SELECT wx.date AS date, IF (wx.element = 'PRCP', wx.value/10, NULL) AS prcp FROM `bigquery-public-data.ghcn_d.ghcnd_2016` AS wx WHERE wx.id = 'USW00094728' ) GROUP BY date ) SELECT ROUND(AVG(bk.num_trips)) AS num_trips, wx.rainy FROM bicycle_rentals AS bk JOIN rainy_days AS wx ON wx.date = bk.trip_date GROUP BY wx.rainyRow num_trips rainy 1 39107.0 false 2 32052.0 true + diff --git a/02_query/all_code.txt b/02_query/all_code.txt new file mode 100644 index 0000000..5eabacd --- /dev/null +++ b/02_query/all_code.txt @@ -0,0 +1,302 @@ +#standardsql#standardsql +SELECT DISTINCT gender FROM `bigquery-public-data`.new_york_citibike.citibike_trips +#standardsql-- simple select SELECT gender, tripduration FROM `bigquery-public-data`.new_york_citibike.citibike_trips LIMIT 5 +gendertripduration +bigquery-public-data.new_york_citibike.citibike_tripsbigquery-public-datanew_york_citibikecitibike_tripsbigquery-public-datanew_yorknew_york_citibikecitibike_tripsbigquery-public-data-- simple select SELECT gender, tripduration FROM `bigquery-public-data.new_york_citibike.citibike_trips` LIMIT 5 +citibike_trips`bigquery-public-data`.new_york_citibike.citibike_trips -- Aliasing column names +SELECT gender, tripduration AS rental_duration FROM `bigquery-public-data`.new_york_citibike.citibike_trips LIMIT 5 + +SELECT gender, tripduration/60 FROM `bigquery-public-data`.new_york_citibike.citibike_trips LIMIT 5 + SELECT gender, tripduration/60 AS duration_minutes FROM `bigquery-public-data`.new_york_citibike.citibike_trips LIMIT 5 + SELECT gender, 
tripduration FROM `bigquery-public-data`.new_york_citibike.citibike_trips WHERE tripduration < 600 LIMIT 5 + SELECT gender, tripduration FROM `bigquery-public-data`.new_york_citibike.citibike_trips WHERE tripduration >= 300 AND tripduration < 600 AND gender = 'female' LIMIT 5 +WHERE tripduration < 600 AND NOT gender = 'female' +WHERE (tripduration < 600 AND gender = 'female') OR gender = 'male'SELECT gender, tripduration/60 AS minutes FROM `bigquery-public-data`.new_york_citibike.citibike_trips WHERE minutes < 10 -- CAN NOT REFERENCE ALIAS IN WHERE LIMIT 5 + +SELECT + gender, tripduration / 60 AS minutes +FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +WHERE (tripduration / 60) < 10 +LIMIT 5SELECT *SELECT + * +FROM + `bigquery-public-data`.new_york_citibike.citibike_stations +WHERE name LIKE '%Riverside%' +WHERELIKERiversideSELECT EXCEPTSELECT + * EXCEPT(short_name, last_reported) +FROM + `bigquery-public-data`.new_york_citibike.citibike_stations +WHERE name LIKE '%Riverside%' +short_namelast_reportedSELECT + * REPLACE(num_bikes_available + 5 AS num_bikes_available) +FROM + `bigquery-public-data`.new_york_citibike.citibike_stations +SELECT * FROM ( + SELECT + gender, tripduration / 60 AS minutes + FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +) +WHERE minutes < 10 +LIMIT 5 +WITH all_trips AS ( + SELECT + gender, tripduration / 60 AS minutes + FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +) + +SELECT * from all_trips +WHERE minutes < 10 +LIMIT 5 +all_trips +SELECT + gender, tripduration/60 AS minutes +FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +WHERE gender = 'female' +ORDER BY minutes DESC +LIMIT 5 + + +SELECT AVG(tripduration / 60) AS avg_trip_duration FROM `bigquery-public-data`.new_york_citibike.citibike_trips WHERE gender = 'male' + SELECT gender, AVG(tripduration / 60) AS avg_trip_duration FROM `bigquery-public-data`.new_york_citibike.citibike_trips WHERE + tripduration is not NULL +GROUP 
BY gender +ORDER BY avg_trip_duration + +SELECT + gender, + COUNT(*) AS rides, + AVG(tripduration / 60) AS avg_trip_duration +FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +WHERE + tripduration IS NOT NULL +GROUP BY + gender +ORDER BY + avg_trip_duration + +SELECT + gender, AVG(tripduration / 60) AS avg_trip_duration +FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +WHERE tripduration IS NOT NULL +GROUP BY + gender +HAVING avg_trip_duration > 14 +ORDER BY + avg_trip_duration + +SELECT DISTINCT + gender FROM `bigquery-public-data`.new_york_citibike.citibike_trips +SELECT + bikeid, + tripduration, + gender +FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +WHERE gender = "" +LIMIT 100SELECT DISTINCT + gender, + usertype +FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +WHERE gender != '' +SELECT + city, SPLIT(city, ' ') AS parts +FROM ( + SELECT * from UNNEST([ + 'Seattle WA', 'New York', 'Singapore' + ]) AS city +) + + + +UNION ALLSELECTWITH example AS ( + SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways + UNION ALL SELECT 'Sun', 2376, 936 + UNION ALL SELECT 'Mon', 1476, 736 +) + +SELECT * from example +WHERE numrides < 2000 +SELECT + gender + , EXTRACT(YEAR from starttime) AS year -- + , COUNT(*) AS numtrips +FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +WHERE gender != 'unknown' and starttime IS NOT NULL +GROUP BY gender, year +HAVING year > 2016 +SELECT + gender + , EXTRACT(YEAR from starttime) AS year + -- comment out this line , COUNT(1) AS numtrips +FROM etc. 
+SELECT + gender + , ARRAY_AGG(numtrips order by year) AS numtrips +FROM ( + SELECT + gender + , EXTRACT(YEAR from starttime) AS year + , COUNT(1) AS numtrips + FROM + `bigquery-public-data`.new_york_citibike.citibike_trips + WHERE gender != 'unknown' and starttime IS NOT NULL + GROUP BY gender, year + HAVING year > 2016 +) +GROUP BY gender +AVG(numtrips)ARRAY_AGGARRAY[ + { + "gender": "male", + "numtrips": [ + "9306602", + "3955871" + ] + }, + { + "gender": "female", + "numtrips": [ + "3236735", + "1260893" + ] + } +] +numtripsWITH example AS ( + SELECT true AS is_vowel, 'a' as letter, 1 as position + UNION ALL SELECT false, 'b', 2 + UNION ALL SELECT false, 'c', 3 +) +SELECT ARRAY_AGG(IF(position = 2, NULL, position)) as positions from example +WITH example AS ( + SELECT true AS is_vowel, 'a' as letter, 1 as position + UNION ALL SELECT false, 'b', 2 + UNION ALL SELECT false, 'c', 3 +) +SELECT ARRAY_LENGTH(ARRAY_AGG(IF(position = 2, NULL, position))) from exampleSELECT + [ + STRUCT('male' as gender, [9306602, 3955871] as numtrips) + , STRUCT('female' as gender, [3236735, 1260893] as numtrips) + ] AS bikerides +SELECT + [ + ('male', [9306602, 3955871]) + , ('female', [3236735, 1260893]) + ] +SELECT + ARRAY_LENGTH(bikerides) as num_items + , bikerides[ OFFSET(0) ].gender as first_gender +FROM +(SELECT + [ + STRUCT('male' as gender, [9306602, 3955871] as numtrips) + , STRUCT('female' as gender, [3236735, 1260893] as numtrips) + ] AS bikerides) +SELECT + [ + STRUCT('male' as gender, [9306602, 3955871] as numtrips) + , STRUCT('female' as gender, [3236735, 1260893] as numtrips) + ] +SELECT * from UNNEST( + [ + STRUCT('male' as gender, [9306602, 3955871] as numtrips) + , STRUCT('female' as gender, [3236735, 1260893] as numtrips) + ]) +SELECT numtrips from UNNEST( + [ + STRUCT('male' as gender, [9306602, 3955871] as numtrips) + , STRUCT('female' as gender, [3236735, 1260893] as numtrips) + ]) +WITH bicycle_rentals AS ( SELECT COUNT(starttime) as num_trips, EXTRACT(DATE 
from starttime) as trip_date FROM `bigquery-public-data`.new_york_citibike.citibike_trips GROUP BY trip_date ), rainy_days AS ( SELECT date, (MAX(prcp) > 5) AS rainy FROM ( SELECT wx.date AS date, IF (wx.element = 'PRCP', wx.value/10, NULL) AS prcp FROM `bigquery-public-data`.ghcn_d.ghcnd_2016 AS wx WHERE wx.id = 'USW00094728' ) GROUP BY date ) SELECT ROUND(AVG(bk.num_trips)) AS num_trips, wx.rainy FROM bicycle_rentals AS bk JOIN rainy_days AS wx ON wx.date = bk.trip_date GROUP BY wx.rainyWITHcitibike_tripsbicycle_rentals.'USW00094728' WITH bicycle_rentals AS ( + SELECT + COUNT(starttime) as num_trips, + EXTRACT(DATE from starttime) as trip_date + FROM `bigquery-public-data`.new_york_citibike.citibike_trips + GROUP BY trip_date +) +SELECT * from bicycle_rentals LIMIT 5 +SELECT + bk.trip_date, bk.num_trips, wx.rainy FROM bicycle_rentals AS bk JOIN rainy_days AS wx ON wx.date = bk.trip_date LIMIT 5 +WITH from_item_a AS ( + SELECT 'Dalles' as city, 'OR' as state + UNION ALL SELECT 'Tokyo', 'Tokyo' + UNION ALL SELECT 'Mumbai', 'Maharashtra' +), + +from_item_b AS ( + SELECT 'OR' as state, 'USA' as country + UNION ALL SELECT 'Tokyo', 'Japan' + UNION ALL SELECT 'Maharashtra', 'India' +) + +SELECT from_item_a.*, country +FROM from_item_a +JOIN from_item_b +ON from_item_a.state = from_item_b.state +SELECT from_item_a.*, country AS surcharge +FROM from_item_a +JOIN from_item_b +ON from_item_a.state != from_item_b.state +WITH winners AS ( + SELECT 'John' as person, '100m' as event + UNION ALL SELECT 'Hiroshi', '200m' + UNION ALL SELECT 'Sita', '400m' +), +gifts AS ( + SELECT 'Google Home' as gift, '100m' as event + UNION ALL SELECT 'Google Hub', '200m' + UNION ALL SELECT 'Pixel3', '400m' +) +SELECT winners.*, gifts.gift +FROM winners +JOIN gifts +WITH winners AS ( + SELECT 'John' as person, '100m' as event + UNION ALL SELECT 'Hiroshi', '200m' + UNION ALL SELECT 'Sita', '400m' +), +gifts AS ( + SELECT 'Google Home' as gift + UNION ALL SELECT 'Google Hub' + UNION ALL SELECT 
'Pixel3' +) +SELECT person, gift +FROM winners +CROSS JOIN gifts +SELECT from_item_a.*, from_item_b.* +FROM from_item_a +CROSS JOIN from_item_bSELECT from_item_a.*, from_item_b.* +FROM from_item_a, from_item_bWITH winners AS ( + SELECT 'John' as person, '100m' as event + UNION ALL SELECT 'Hiroshi', '200m' + UNION ALL SELECT 'Sita', '400m' + UNION ALL SELECT 'Kwame', '50m' +), +gifts AS ( + SELECT 'Google Home' as gift, '100m' as event + UNION ALL SELECT 'Google Hub', '200m' + UNION ALL SELECT 'Pixel3', '400m' + UNION ALL SELECT 'Google Mini', '5000m' +) +SELECT person, gift +FROM winners +INNER JOIN gifts +ON winners.event = gifts.event +SELECT person, gift +FROM winners +FULL OUTER JOIN gifts +ON winners.event = gifts.event +SELECT person, gift +FROM winners +LEFT OUTER JOIN gifts +ON winners.event = gifts.event +SELECT person, gift +FROM winners +RIGHT OUTER JOIN gifts +ON winners.event = gifts.event diff --git a/03_func/all_code.txt b/03_func/all_code.txt new file mode 100644 index 0000000..42c0e9e --- /dev/null +++ b/03_func/all_code.txt @@ -0,0 +1,298 @@ +WITH example AS ( + SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways + UNION ALL SELECT 'Sun', 2376, 936 +) +SELECT *, (oneways/numrides) AS frac_oneway from example + +CREATE TEMP FUNCTION lastElement(arr ANY TYPE) AS ( + arr[ORDINAL(ARRAY_LENGTH(arr))] +); +WITH example AS ( + SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways + UNION ALL SELECT 'Sun', 2376, 936 +) +SELECT *, ROUND(oneways/numrides, 2) AS frac_oneway from exampleWITH example AS ( + SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways + UNION ALL SELECT 'Sun', 2376, 936 + UNION ALL SELECT 'Wed', 0, 0 +) +SELECT + *, ROUND(IEEE_Divide(oneways, numrides), 2) +AS frac_oneway from example +NaNSAFESELECT LOG(10, -3), LOG(10, 3) +SELECT SAFE.LOG(10, -3), SAFE.LOG(10, 3) +NULLLOG(10, -3)SAFESUBSTRNULLSAFE.SUBSTRNULLNaN-infWITH example AS ( + SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways + UNION ALL SELECT 'Sun', 2376, 936 + 
UNION ALL SELECT 'Mon', NULL, NULL + UNION ALL SELECT 'Tue', IEEE_Divide(-3,0), 0 -- this is -inf,0 +) +SELECT * from example +ORDER BY numrides +SELECT * from example +WHERE numrides < 2000 +!genderWITH example AS ( + SELECT 1.23 AS payment + UNION ALL SELECT 7.89 + UNION ALL SELECT 12.43 +) +SELECT + SUM(payment) AS total_paid, + AVG(payment) AS average_paid +FROM example +WITH example AS ( + SELECT NUMERIC '1.23' AS payment + UNION ALL SELECT NUMERIC '7.89' + UNION ALL SELECT NUMERIC '12.43' +) +SELECT + SUM(payment) AS total_paid, + AVG(payment) AS average_paid +FROM example +NUMERIC '1.23')SELECT gender, tripduration FROM `bigquery-public-data`.new_york_citibike.citibike_trips WHERE (tripduration < 600 AND gender = 'female') OR gender = 'male'WITH example AS ( + SELECT NULL AS is_vowel, NULL as letter, -1 as position + UNION ALL SELECT true, 'a', 1 + UNION ALL SELECT false, 'b', 2 + UNION ALL SELECT false, 'c', 3 +) +SELECT * from example WHERE is_vowel != false +WITH example AS ( + SELECT NULL AS is_vowel, NULL as letter, -1 as position + UNION ALL SELECT true, 'a', 1 + UNION ALL SELECT false, 'b', 2 + UNION ALL SELECT false, 'c', 3 +) +SELECT * from example WHERE is_vowel IS NOT false +WITH example AS ( + SELECT NULL AS is_vowel, NULL as letter, -1 as position + UNION ALL SELECT true, 'a', 1 + UNION ALL SELECT false, 'b', 2 + UNION ALL SELECT false, 'c', 3 +) +SELECT * from example WHERE is_vowel +WITH catalog AS ( + SELECT 30.0 AS costPrice, 0.15 AS markup, 0.1 AS taxRate + UNION ALL SELECT NULL, 0.21, 0.15 + UNION ALL SELECT 30.0, NULL, 0.09 + UNION ALL SELECT 30.0, 0.30, NULL + UNION ALL SELECT 30.0, NULL, NULL +) +SELECT + *, ROUND( + costPrice * + IF(markup IS NULL, 1.05, 1+markup) * + IF(taxRate IS NULL, 1.10, 1+taxRate) + , 2) AS salesPrice +FROM catalog +WITH catalog AS ( + SELECT 30.0 AS costPrice, 0.15 AS markup, 0.1 AS taxRate + UNION ALL SELECT NULL, 0.21, 0.15 + UNION ALL SELECT 30.0, NULL, 0.09 + UNION ALL SELECT 30.0, 0.30, NULL + UNION ALL 
SELECT 30.0, NULL, NULL +) +SELECT + *, ROUND(COALESCE( + costPrice * (1+markup) * (1+taxrate), + costPrice * 1.05 * (1+taxrate), + costPrice * (1+markup) * 1.10, + NULL + ),2) AS salesPrice +FROM catalog +IFNULLCOALESCEIFNULL(a, b)COALESCE(a, b)ba NULL IFNULL(a, b) IF(a IS NULL, b, a)SELECT + *, ROUND( + costPrice * + (1 + IFNULL(markup, 0.05)) * + (1 + IFNULL(taxrate,0.10)) + , 2) AS salesPrice +FROM catalog + +WITH example AS ( + SELECT 'John' as employee, 'Paternity Leave' AS hours_worked + UNION ALL SELECT 'Janaki', '35' + UNION ALL SELECT 'Jian', 'Vacation' + UNION ALL SELECT 'Jose', '40' +) + +WITH example AS ( + SELECT 'John' as employee, 'Paternity Leave' AS hours_worked + UNION ALL SELECT 'Janaki', '35' + UNION ALL SELECT 'Jian', 'Vacation' + UNION ALL SELECT 'Jose', '40' +) +SELECT SUM(hours_worked) from example +SELECT CAST("true" AS bool), CAST("invalid" AS bool) +SELECT CAST("true" AS bool), SAFE_CAST("invalid" AS bool) +hours_workedWITH example AS ( + SELECT 'John' as employee, 'Paternity Leave' AS hours_worked + UNION ALL SELECT 'Janaki', '35' + UNION ALL SELECT 'Jian', 'Vacation' + UNION ALL SELECT 'Jose', '40' +) +SELECT SUM(SAFE_CAST(hours_worked AS INT64)) from example +WITH example AS ( + SELECT 'John' as employee, '0' AS hours_worked + UNION ALL SELECT 'Janaki', '35' + UNION ALL SELECT 'Jian', '0' + UNION ALL SELECT 'Jose', '40' +) +SELECT SUM(CAST(hours_worked AS INT64)) from example +WITH example AS ( + SELECT true AS is_vowel, 'a' as letter, 1 as position + UNION ALL SELECT false, 'b', 2 + UNION ALL SELECT false, 'c', 3 +) +SELECT * from example +SELECT SUM(is_vowel) as num_vowels from example + +WITH example AS ( + SELECT true AS is_vowel, 'a' as letter, 1 as position + UNION ALL SELECT false, 'b', 2 + UNION ALL SELECT false, 'c', 3 +) +SELECT SUM(CAST (is_vowel AS INT64)) as num_vowels from example +WITH example AS ( + SELECT true AS is_vowel, 'a' as letter, 1 as position + UNION ALL SELECT false, 'b', 2 + UNION ALL SELECT false, 'c', 3 
+) +SELECT SUM(IF(is_vowel, 1, 0)) as num_vowels from example +WITH example AS ( + SELECT true AS is_vowel, 'a' as letter, 1 as position + UNION ALL SELECT false, 'b', 2 + UNION ALL SELECT false, 'c', 3 +) +SELECT COUNTIF(is_vowel) as num_vowels from example +WITH example AS ( + SELECT * from unnest([ + 'Seattle', 'New York', 'Singapore' + ]) AS city +) +SELECT + city + , LENGTH(city) AS len + , LOWER(city) AS lower + , STRPOS(city, 'or') AS orpos +FROM example +WITH example AS ( + SELECT 'armin@abc.com' AS email, 'Anapolis, MD' as city + UNION ALL SELECT 'boyan@bca.com', 'Boulder, CA' + UNION ALL SELECT 'carrie@cab.com', 'Chicago, IL' +) + +SELECT + CONCAT( + SUBSTR(email, 1, STRPOS(email, '@') - 1), -- username + ' from ', city) AS callers +FROM example +WITH example AS ( + SELECT * from unnest([ + 'Seattle', 'New York', 'சிங்கப்பூர்', '東京' + ]) AS city +) +SELECT + city + , UPPER(city) AS allcaps + , CAST(city AS BYTES) as bytes +FROM example +WITH example AS ( + SELECT * from unnest([ + 'Seattle', 'New York', 'சிங்கப்பூர்', '東京' + ]) AS city +) +SELECT + city + , CHAR_LENGTH(city) as char_len + , TO_CODE_POINTS(city)[OFFSET(1)] as first_code_point + , ARRAY_LENGTH(TO_CODE_POINTS(city)) as num_code_points + , CAST (city AS BYTES) as bytes + , BYTE_LENGTH(city) as byte_len +FROM example +SELECT + CAST(42 AS STRING) + , CAST('42' AS INT64) + , FORMAT('%03d', 42) + , FORMAT('%5.3f', 32.457842) + , FORMAT('%5.3f', 32.4) + , FORMAT('**%s**', 'H') + , FORMAT('%s-%03d', 'Agent', 7) +SELECT + ENDS_WITH('Hello', 'o') -- true + , ENDS_WITH('Hello', 'h') -- false + , STARTS_WITH('Hello', 'h') -- false + , STRPOS('Hello', 'e') -- 2 + , STRPOS('Hello', 'f') -- 0 for not-found + , SUBSTR('Hello', 2, 4) -- 1-based + , CONCAT('Hello', 'World') +SELECT + LPAD('Hello', 10, '*') -- left pad with * + , RPAD('Hello', 10, '*') -- right pad + , LPAD('Hello', 10) -- left pad with spaces + , LTRIM(' Hello ') -- trim whitespace on left + , RTRIM(' Hello ') -- trim whitespace on right + , 
TRIM (' Hello ') -- trim whitespace both ends + , TRIM ('***Hello***', '*') -- trim * both ends + , REVERSE('Hello') -- reverse the string +SELECT + column + , REGEXP_CONTAINS(column, r'\d{5}(?:[-\s]\d{4})?') has_zipcode + , REGEXP_CONTAINS(column, r'^\d{5}(?:[-\s]\d{4})?$') is_zipcode + , REGEXP_EXTRACT(column, r'\d{5}(?:[-\s]\d{4})?') the_zipcode + , REGEXP_EXTRACT_ALL(column, r'\d{5}(?:[-\s]\d{4})?') all_zipcodes + , REGEXP_REPLACE(column, r'\d{5}(?:[-\s]\d{4})?', '*****') masked +FROM ( + SELECT * from unnest([ + '12345', '1234', '12345-9876', + 'abc 12345 def', 'abcde-fghi', + '12345 ab 34567', '12345 9876' + ]) AS column +) +rSELECT t1, t2, TIMESTAMP_DIFF(t1, t2, MICROSECOND) +FROM (SELECT + TIMESTAMP "2017-09-27 12:30:00.45" AS t1, + TIMESTAMP "2017-09-27 13:30:00.45+1" AS t2 +) +SELECT + fmt, input, zone + , PARSE_TIMESTAMP(fmt, input, zone) AS ts +FROM ( + SELECT '%Y%m%d-%H%M%S' AS fmt, '20181118-220800' AS input, '+0' as zone + UNION ALL SELECT '%c', 'Sat Nov 24 21:26:00 2018', 'America/Los_Angeles' + UNION ALL SELECT '%x %X', '11/18/18 22:08:00', 'UTC' +) +SELECT + ts, fmt + , FORMAT_TIMESTAMP(fmt, ts, '+6') AS ts_output +FROM ( + SELECT CURRENT_TIMESTAMP() AS ts, '%Y%m%d-%H%M%S' AS fmt + UNION ALL SELECT CURRENT_TIMESTAMP() AS ts, '%c' AS fmt + UNION ALL SELECT CURRENT_TIMESTAMP() AS ts, '%x %X' AS fmt +) +EXTRACT(WEEK('SATURDAY') FROM ts)SELECT + UNIX_MILLIS(TIMESTAMP "2018-11-25 22:30:00 UTC") + , UNIX_MILLIS(TIMESTAMP "1918-11-11 22:30:00 UTC") --invalid + , TIMESTAMP_MILLIS(1543185000000) +SELECT + EXTRACT(TIME FROM TIMESTAMP_ADD(t1, INTERVAL 1 HOUR)) AS plus_1h + , EXTRACT(TIME FROM TIMESTAMP_SUB(t1, INTERVAL 10 MINUTE)) AS minus_10min + , TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), + TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 MINUTE), + SECOND) AS plus_1min + , TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), + TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 1 MINUTE), + SECOND) AS minus_1min +FROM (SELECT + TIMESTAMP "2017-09-27 12:30:00.45" AS t1 +) +SELECT + 
EXTRACT(DATETIME FROM CURRENT_TIMESTAMP()) as dt + , CAST(CURRENT_DATETIME() AS TIMESTAMP) as ts +ST_GeogPoint(-122.33, 47.61) +SELECT + state_name +FROM `bigquery-public-data`.utility_us.us_states_area +WHERE + ST_Contains( + state_geom, + ST_GeogPoint(-122.33, 47.61)) +state_geom diff --git a/04_load/all_code.txt b/04_load/all_code.txt new file mode 100644 index 0000000..4164c79 --- /dev/null +++ b/04_load/all_code.txt @@ -0,0 +1,366 @@ +SELECT + state_name +FROM `bigquery-public-data`.utility_us.us_states_area +WHERE + ST_Contains( + state_geom, + ST_GeogPoint(-122.33, 47.61)) +(-122.33, 47.61)state_namestate_geomFROMbigquery-public-data us_states_areautility_us.utility_us us_states_areazlesszless college_scorecard.csv.gz +bq --location=US mk ch04 +bqbqmkch04us-east4europe-west2australia-southeast1bq --location=US \ + load \ + --source_format=CSV --autodetect \ + ch04.college_scorecard \ + ./college_scorecard.csv.gz +Could not parse 'NULL' as int for field HBCU (position 26) starting at location 11945910 +CSV table encountered too many errors, giving up. Rows: 591; errors: 1. +HBCUNULLNULL --max_bad_records=20NULLbq --location=US \ + load --null_marker=NULL \ + --source_format=CSV --autodetect \ + ch04.college_scorecard \ + ./college_scorecard.csv.gz +bq loadbq load --helpbq --location=US \ + load --null_marker=NULL --replace \ + --source_format=CSV --autodetect \ + ch04.college_scorecard \ + ./college_scorecard.csv.gz +--replace=falsech04SELECT + INSTNM + , ADM_RATE_ALL + , FIRST_GEN + , MD_FAMINC + , MD_EARN_WNE_P10 + , SAT_AVG +FROM + ch04.college_scorecard +WHERE + SAFE_CAST(SAT_AVG AS FLOAT64) > 1300 + AND SAFE_CAST(ADM_RATE_ALL AS FLOAT64) < 0.2 + AND SAFE_CAST(FIRST_GEN AS FLOAT64) > 0.1 +ORDER BY + CAST(MD_FAMINC AS FLOAT64) ASC +SAFE_CAST(ADM_RATE_ALL AS FLOAT64)No matching signature for operator > for argument types: STRING, INT64.PrivacySuppressedBad double value: PrivacySuppressed; while executing the filter ... 
+PrivacySuppressedzless ./college_scorecard.csv.gz | \ + sed 's/PrivacySuppressed/NULL/g' | \ + gzip > /tmp/college_scorecard.csv.gz +sedSAT_AVGADM_RATEbq show --format prettyjson --schema ch04.college_scorecard +bq show --format prettyjson --schema ch04.college_scorecard > schema.json +SELECT + table_name + , column_name + , ordinal_position + , is_nullable + , data_type +FROM + ch04.INFORMATION_SCHEMA.COLUMNS +TO_JSON_STRINGSELECT + TO_JSON_STRING( + ARRAY_AGG(STRUCT( + IF(is_nullable = 'YES', 'NULLABLE', 'REQUIRED') AS mode, + column_name AS name, + data_type AS type) + ORDER BY ordinal_position), TRUE) AS schema +FROM + ch04.INFORMATION_SCHEMA.COLUMNS +WHERE + table_name = 'college_scorecard' +[ + { + "mode": "NULLABLE", + "name": "INSTNM", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "ADM_RATE_ALL", + "type": "FLOAT64" + }, +... +SAT_AVG, ADM_RATE_ALL, FIRST_GEN MD_FAMINC) FLOAT64: + { + "mode": "NULLABLE", + "name": "FIRST_GEN", + "type": "FLOAT64" + }, + +T4APPROVALDATESTRING { + "mode": "NULLABLE", + "name": "T4APPROVALDATE", + "type": "STRING" + }, + +bq --location=US \ + load --null_marker=NULL --replace \ + --source_format=CSV \ + --schema=schema.json --skip_leading_rows=1 \ + ch04.college_scorecard \ + ./college_scorecard.csv.gz + +SELECT + INSTNM + , ADM_RATE_ALL + , FIRST_GEN + , MD_FAMINC + , MD_EARN_WNE_P10 + , SAT_AVG +FROM + ch04.college_scorecard +WHERE + SAT_AVG > 1300 + AND ADM_RATE_ALL < 0.2 + AND FIRST_GEN > 0.1 +ORDER BY + MD_FAMINC ASC +CREATE TABLECREATE OR REPLACE TABLE ch04.college_scorecard_etl AS + SELECT + INSTNM + , ADM_RATE_ALL + , FIRST_GEN + , MD_FAMINC + , SAT_AVG + , MD_EARN_WNE_P10 + FROM ch04.college_scorecard +bq rm ch04.college_scorecard +bq rm -r -f ch04 +DROP TABLE IF EXISTS ch04.college_scorecard_gcs +ALTER TABLE SET OPTIONSALTER TABLE ch04.college_scorecard + SET OPTIONS ( + expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), + INTERVAL 7 DAY), + description="College Scorecard expires seven days from 
now" + ) +DROP TABLEALTER TABLECREATE TABLEDELETE FROM ch04.college_scorecard +WHERE SAT_AVG IS NULL +INSERT ch04.college_scorecard + (INSTNM + , ADM_RATE_ALL + , FIRST_GEN + , MD_FAMINC + , SAT_AVG + , MD_EARN_WNE_P10 + ) + VALUES ('abc', 0.1, 0.3, 12345, 1234, 23456), + ('def', 0.2, 0.2, 23451, 1232, 32456) +INSERT ch04.college_scorecard +SELECT * +FROM ch04.college_scorecard_etl +WHERE SAT_AVG IS NULL +SQL COPYbq cpbq cp ch04.college_scorecard someds.college_scorecard_copy +bq cp-a--append_table-noappend_tableCREATE TABLE AS SELECTINSERT VALUESbq cpgsutil cpbq loadgsutil -m cp *.csv gs://BUCKET/some/location +bqload … gs://BUCKET/some/location/*.csv +bq loadbq loadbq loadbq mkdef +bq mkbq mkdefbq load--autodetect: +bq mkdef --source_format=CSV \ + --autodetect \ + gs://bigquery-oreilly-book/college_scorecard.csv +bq mkbq mkdef --source_format=CSV \ + --autodetect \ + gs://bigquery-oreilly-book/college_scorecard.csv \ + > /tmp/mytable.json +bq mk --external_table_definition=/tmp/mytable.json \ + ch04.college_scorecard + +course_grades.csv-00095-of-00313bq mkdef --source_format=CSV \ + --autodetect \ + gs://bigquery-oreilly-book/college_* \ + > /tmp/mytable.json +LOC="--location US" +INPUT=gs://bigquery-oreilly-book/college_scorecard.csv + +SCHEMA=$(gsutil cat $INPUT | head -1 | awk -F, '{ORS=","}{for (i=1; i <= NF; i++){ print $i":STRING"; }}' | sed 's/,$//g'| cut -b 4- ) + +bq $LOC query \ + --external_table_definition=cstable::${SCHEMA}@CSV=${INPUT} \ + 'SELECT SUM(IF(SAT_AVG != "NULL", 1, 0))/COUNT(SAT_AVG) FROM cstable' +--external_table_definition=cstable::${DEF}bq mkdef --source_format=PARQUET gs://bucket/dir/files* > table_def.json +bq mk --external_table_definition=table_def.json . +bq load --source_format=ORC --autodetect \ + --hive_partitioning_mode=AUTO .
+datestampgs://some-bucket/some-dir/some-table/* +gs://some-bucket/some-dir/some-table/datestamp= +STRINGINTEGERDATETIMESTAMPbq load --source_format=ORC --autodetect \ + --hive_partitioning_mode=STRINGS .
bq mkdef --source_format=ORC --autodetect \ + --hive_partitioning_mode=AUTO > table_def.jsonbq mkdef --source_format=NEWLINE_DELIMITED_JSON --autodetect --hive_partitioning_mode=STRINGS > table_def.json +bq mkdefFIELD1:DATATYPE1,FIELD2:DATATYPE2,... +FIELD1,FIELD2,FIELD3,,...INPUT=gs://bigquery-oreilly-book/college_scorecard.csv +SCHEMA=$(gsutil cat $INPUT | head -1 | cut -b 4- ) +sedLOC="--location US" +OUTPUT=/tmp/college_scorecard_def.json +bq $LOC \ + mkdef \ + --source_format=CSV \ + --noautodetect \ + $INPUT \ + $SCHEMA \ + | sed 's/"skipLeadingRows": 0/"skipLeadingRows": 1/g' \ + | sed 's/"allowJaggedRows": false/"allowJaggedRows": true/g' \ + > $OUTPUT +SELECT + MAX(CAST(SAT_AVG AS FLOAT64)) AS MAX_SAT_AVG +FROM + `ch04.college_scorecard_gcs` +Bad double value: NULL +NULLWITH etl_data AS ( + SELECT + SAFE_CAST(SAT_AVG AS FLOAT64) AS SAT_AVG + FROM + `ch04.college_scorecard_gcs` +) +SELECT + MAX(SAT_AVG) AS MAX_SAT_AVG +FROM + etl_data +CREATE TEMP FUNCTION cleanup_numeric(x STRING) AS +( + IF ( x != 'NULL' AND x != 'PrivacySuppressed', + CAST(x as FLOAT64), + NULL ) +); + +WITH etl_data AS ( + SELECT + INSTNM + , cleanup_numeric(ADM_RATE_ALL) AS ADM_RATE_ALL + , cleanup_numeric(FIRST_GEN) AS FIRST_GEN + , cleanup_numeric(MD_FAMINC) AS MD_FAMINC + , cleanup_numeric(SAT_AVG) AS SAT_AVG + , cleanup_numeric(MD_EARN_WNE_P10) AS MD_EARN_WNE_P10 + FROM + `ch04.college_scorecard_gcs` +) + +SELECT + * +FROM + etl_data +WHERE + SAT_AVG > 1300 + AND ADM_RATE_ALL < 0.2 + AND FIRST_GEN > 0.1 +ORDER BY + MD_FAMINC ASC +LIMIT 10 +SELECT *CREATE TABLECREATE TEMP FUNCTION cleanup_numeric(x STRING) AS +( + IF ( x != 'NULL' AND x != 'PrivacySuppressed', + CAST(x as FLOAT64), + NULL ) +); + +CREATE TABLE ch04.college_scorecard_etl +OPTIONS(description="Cleaned up college scorecard data") AS + +WITH etl_data AS ( + SELECT + INSTNM + , cleanup_numeric(ADM_RATE_ALL) AS ADM_RATE_ALL + , cleanup_numeric(FIRST_GEN) AS FIRST_GEN + , cleanup_numeric(MD_FAMINC) AS MD_FAMINC + , 
cleanup_numeric(SAT_AVG) AS SAT_AVG + , cleanup_numeric(MD_EARN_WNE_P10) AS MD_EARN_WNE_P10 + FROM + `ch04.college_scorecard_gcs` +) + +SELECT * FROM etl_data + +CREATE TABLEbq query--destination_tableJSON_EXTRACTprotopayload_auditlog.metadataJsontableDataReadSELECT + REGEXP_EXTRACT(protopayload_auditlog.resourceName, '^projects/[^/]+/datasets/([^/]+)/tables') AS datasetRef, + COUNTIF(JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.tableDataRead") IS NOT NULL) AS dataReadEvents, + FROM ch04.cloudaudit_googleapis_com_data_access_2019* + WHERE + JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.tableDataRead") IS NOT NULL + GROUP BY datasetRef + ORDER BY dataReadEvents DESC + LIMIT 5 +JSON_EXTRACTprotopayload_auditlog.metadataJsonsomedbmysql somedb < select_data.sql | \ + gsutil cp - gs://BUCKET/data_$(date -u "+%F-%T").tsvselect_data.sqlselect * from my_table +where transaction_date >= DATE_SUB(CURDATE(), INTERVAL 10 DAY) +https://sheets.newSELECT * from advdata.students +SELECT + * +FROM + ch04.college_scorecard_etl=ArrayFormula(IF(ISBLANK(D2:D), 0, F2:F/D2:D)) +stateRowsyearColumnsValuesnumberCOUNT_UNIQUEDefaultcollege_scorecard_gsSELECT INSTNM, COUNT(display_name) AS numusers +FROM `bigquery-public-data`.stackoverflow.users, ch04.college_scorecard_gs +WHERE REGEXP_CONTAINS(about_me, INSTNM) +GROUP BY INSTNM +ORDER BY numusers DESC +LIMIT 5 +GOOG#buy#20190119-090356.0322234setup_data.shhttps://googleapis.com/bigtable/projects/[PROJECT_ID]/instances/[INSTANCE_ID]/tables/[TABLE_NAME]. 
PROJECT_IDINSTANCE_IDTABLE_NAMElogs-tablesalesSELECT SUM(sales.qty.cell.value) AS num_sold +FROM ch04.logs +WHERE sales.itemid.cell.value = '12345' +asia-northeast1,EUbigquery.adminCREATE OR REPLACE TABLE +ch04.college_scorecard_dts +AS +SELECT * FROM ch04.college_scorecard_gcs +LIMIT 0 +college_scorecard_dts CREATE TABLE +ch04.college_scorecard_valid_sat +AS +SELECT * FROM ch04.college_scorecard_gcs +WHERE LENGTH(SAT_AVG) > 0 +ch04.college_scorecard_valid_satch04.college_scorecard_gcs CREATE TABLECREATE OR REPLACECREATE IF NOT EXISTSCREATE TABLE ch04.payment_transactions +( + PAYEE STRING OPTIONS(description="Id of payee"), + AMOUNT NUMERIC OPTIONS(description="Amount paid") +) +bq mk --transfer_config --data_source=google_cloud_storage \ +--target_dataset=ch04 --display_name ch04_college_scorecard \ +--params='{"data_path_template":"gs://bigquery-oreilly-book/college_*.csv", "destination_table_name_template":"college_scorecard_dts", "file_format":"CSV", "max_bad_records":"10", "skip_leading_rows":"1", "allow_jagged_rows":"true"}' +youtube_channelch04mytable_{run_time|"%Y%m%d"} +datetimemytable_{run_date}{run_time+45m|"%Y%m%d"}_mytable_{run_time|"%H%M%s"} +20180915_mytable_004500run_daterun_timeSELECT + gender, AVG(tripduration / 60) AS avg_trip_duration +FROM + `bigquery-public-data`.new_york_citibike.citibike_trips +GROUP BY + gender +HAVING avg_trip_duration > 14 +ORDER BY + avg_trip_duration +SELECT protopayload_auditlog.status.message FROM ch04.cloudaudit_googleapis_com_data_access_20190128 + + INPATTERNS = 'gs://bigquery-oreilly-book/college_*.csv' + RUNNER = 'DataflowRunner' + with beam.Pipeline(RUNNER, options = opts) as p: + (p + | 'read' >> beam.io.ReadFromText(INPATTERNS, skip_header_lines=1) + | 'parse_csv' >> beam.FlatMap(parse_csv) + | 'pull_fields' >> beam.FlatMap(pull_fields) + | 'write_bq' >> beam.io.gcp.bigquery.WriteToBigQuery(bqtable, bqdataset, schema=get_output_schema()) + ) +parse_csvdef parse_csv(line): + try: + values = line.split(',') + 
rowdict = {} + for colname, value in zip(COLNAMES, values): + rowdict[colname] = value + yield rowdict + except: + logging.warn('Ignoring line ...')parse_csvpull_fieldsINSTNMdef pull_fields(rowdict): + result = {} + # required string fields + for col in 'INSTNM'.split(','): + if col in rowdict: + result[col] = rowdict[col] + else: + logging.info('Ignoring line missing {}', col) + return + + # float fields + for col in 'ADM_RATE_ALL,FIRST_GEN,MD_FAMINC,SAT_AVG,MD_EARN_WNE_P10'.split(','): + try: + result[col] = (float) (rowdict[col]) + except: + result[col] = None + yield result +beam.io.gcp.bigquery.WriteToBigQueryINSTNM:string,ADM_RATE_ALL:FLOAT64,FIRST_GEN:FLOAT64,... +# create an array of tuples and insert as data becomes available +rows_to_insert = [ + (u'U. Puerto Rico', 0.18,0.46,23000,1134,32000), + (u'Guam U.', 0.43,0.21,28000,1234,33000) +] +errors = client.insert_rows(table, rows_to_insert) # API request +bq load gsutilgsutil -m cp /some/dir/myfiles*.csv gs://bucket/some/dir +bq loadgsutil bq load diff --git a/05_devel/all_code.txt b/05_devel/all_code.txt new file mode 100644 index 0000000..1510b0b --- /dev/null +++ b/05_devel/all_code.txt @@ -0,0 +1,684 @@ +bqhttps://www.googleapis.com/bigquery/v2/projects//datasets/ +https://www.googleapis.com/bigquery/v2curlDELETE/projects//datasets/ /projects//datasets//tables/
+.../projects//datasets//tables +#!/bin/bash +PROJECT=$(gcloud config get-value project) +access_token=$(gcloud auth application-default print-access-token) +curl -H "Authorization: Bearer $access_token" \ + -H "Content-Type: application/json" \ + -X GET "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/datasets/ch04/tables" +.../projects/$PROJECT/datasets/ch04/tablescurlINFORMATION_SCHEMASELECT + table_name, creation_time +FROM + ch04.INFORMATION_SCHEMA.TABLES +INFORMATION_SCHEMACREATE TABLE +CREATE TABLE IF NOT EXISTS +CREATE OR REPLACE TABLE +ALTER TABLE SET OPTIONS +ALTER TABLE IF EXISTS SET OPTIONS +INSERT INTO +DELETE FROM +UPDATE +MERGE +DROP TABLE +INFORMATION_SCHEMA.SCHEMATA +INFORMATION_SCHEMA.SCHEMATA_OPTIONS +INFORMATION_SCHEMA.TABLES +INFORMATION_SCHEMA.TABLE_OPTIONS +INFORMATION_SCHEMA.COLUMNS +INFORMATION_SCHEMA.COLUMN_FIELD_PATHS +INFORMATION_SCHEMA.JOBS_BY_USER +INFORMATION_SCHEMA.JOBS_BY_PROJECT +INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION +.../projects//queries{ + "useLegacySql": false, + "query": \"${QUERY_TEXT}\" +} +QUERY_TEXTread -d '' QUERY_TEXT << EOF +SELECT + start_station_name + , AVG(duration) as duration + , COUNT(duration) as num_trips +FROM \`bigquery-public-data\`.london_bicycles.cycle_hire +GROUP BY start_station_name +ORDER BY num_trips DESC +LIMIT 5 +EOF +curl -H "Authorization: Bearer $access_token" \ + -H "Content-Type: application/json" \ + -X POST \ + -d "$request" \ + "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/queries" +$request"schema": { + "fields": [ + { + "name": "start_station_name", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "duration", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "num_trips", + "type": "INTEGER", + "mode": "NULLABLE" + } + ] + }, +{ + "f": [ + { + "v": "Belgrove Street , King's Cross" + }, + { + "v": "1011.0766960393793" + }, + { + "v": "234458" + } + ] +}, +fv{ + "useLegacySql": false, + "timeoutMs": 0, + "useQueryCache": false, + "query": 
\"${QUERY_TEXT}\" +} +{ + "kind": "bigquery#queryResponse", + "jobReference": { + "projectId": "cloud-training-demos", + "jobId": "job_gv0Kq8nWzXIkuBwoxsKMcTJIVbX4", + "location": "EU" + }, + "jobComplete": false +} +.../projects//jobs/ +jobComplete.../projects//queries/pip install google-cloud-bigquery +from google.cloud import bigquery +bq = bigquery.Client(project=PROJECT) +bqdsinfo = bq.get_dataset('bigquery-public-data.london_bicycles') +dsinfo = bq.get_dataset('ch04') +dsinfoprint(dsinfo.dataset_id) +print(dsinfo.created) +ch04ch04 +2019-01-26 00:41:01.350000+00:00 +print('{} created on {} in {}'.format( + dsinfo.dataset_id, dsinfo.created, dsinfo.location)) +bigquery-public-data.london_bicycleslondon_bicycles created on 2017-05-25 13:26:18.055000+00:00 in EU +dsinfoREADERfor access in dsinfo.access_entries: + if access.role == 'READER': + print(access) + + + +ch05dataset_id = "{}.ch05".format(PROJECT) +ds = bq.create_dataset(dataset_id, exists_ok=True) +Datasetdsinfolocationcreate_datasetDatasetdataset_iddataset_id = "{}.ch05eu".format(PROJECT) +dsinfo = bigquery.Dataset(dataset_id) +dsinfo.location = 'EU' +ds = bq.create_dataset(dsinfo, exists_ok=True) +ch05bq.delete_dataset('ch05', not_found_ok=True) +bq.delete_dataset('{}.ch05'.format(PROJECT), not_found_ok=True)dsinfoupdate_datasetdsinfo = bq.get_dataset("ch05") +print(dsinfo.description) +dsinfo.description = "Chapter 5 of BigQuery: The Definitive Guide" +dsinfo = bq.update_dataset(dsinfo, ['description']) +print(dsinfo.description) +printNonech05update_datasetNone +Chapter 5 of BigQuery: The Definitive Guide +dsinfo = bq.get_dataset("ch05") +entry = bigquery.AccessEntry( + role="READER", + entity_type="userByEmail", + entity_id="xyz@google.com", +) +if entry not in dsinfo.access_entries: + entries = list(dsinfo.access_entries) + entries.append(entry) + dsinfo.access_entries = entries + dsinfo = bq.update_dataset(dsinfo, ["access_entries"]) # API request +else: + print('{} already has 
access'.format(entry.entity_id)) +print(dsinfo.access_entries) +list_tablestables = bq.list_tables("bigquery-public-data.london_bicycles") +for table in tables: + print(table.table_id) +cycle_hire +cycle_stations +COUNT(*)table = bq.get_table( + "bigquery-public-data.london_bicycles.cycle_stations") +print('{} rows in {}'.format(table.num_rows, table.table_id)) +787 rows in cycle_stations +table = bq.get_table( + "bigquery-public-data.london_bicycles.cycle_stations") +for field in table.schema: + if 'count' in field.name: + print(field) +SchemaField('bikes_count', 'INTEGER', 'NULLABLE', '', ()) +SchemaField('docks_count', 'INTEGER', 'NULLABLE', '', ()) +bq.delete_table('ch05.temp_table', not_found_ok=True) +bq --location=US cp ch05.temp_table@1418864998000 ch05.temp_table2 +1418864998000 table_id = '{}.ch05.temp_table'.format(PROJECT) +table = bq.create_table(table_id, exists_ok=True) + +schema = [ + bigquery.SchemaField("chapter", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("title", "STRING", mode="REQUIRED"), +] +table_id = '{}.ch05.temp_table'.format(PROJECT) +table = bq.get_table(table_id) +print(table.etag) +table.schema = schema +table = bq.update_table(table, ["schema"]) +print(table.schema) +print(table.etag) +get_tableupdate_tabletable.etagNoneNULLABLEREQUIREDNULLABLErows = [ + (1, u'What is BigQuery?'), + (2, u'Query essentials'), +] +errors = bq.insert_rows(table, rows) +rows = [ + ('3', u'Operating on data types'), + ('wont work', u'This will fail'), + ('4', u'Loading data into BigQuery'), +] +errors = bq.insert_rows(table, rows) +print(errors) +reasoninvalidindex=1location=chapter{'index': 1, 'errors': [{'reason': 'invalid', 'debugInfo': '', 'message': 'Cannot convert value to integer (bad value):wont work', 'location': 'chapter'}]} + +reasonstopped{'index': 0, 'errors': [{'reason': 'stopped', 'debugInfo': '', 'message': '', 'location': ''}]} +rows = [ + (1, u'What is BigQuery?'), + (2, u'Query essentials'), +] +print(table.table_id, 
table.num_rows) +errors = bq.insert_rows(table, rows) +print(errors) +table = bq.get_table(table_id) +print(table.table_id, table.num_rows) # DELAYED +table.num_rowsSELECT DISTINCT(chapter) FROM ch05.temp_table +schema = [ + bigquery.SchemaField("chapter", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("title", "STRING", mode="REQUIRED"), +] +table_id = '{}.ch05.temp_table2'.format(PROJECT) +table = bigquery.Table(table_id, schema) +table = bq.create_table(table, exists_ok=True) +print('{} created on {}'.format(table.table_id, table.created)) +print(table.schema) +temp_table2 created on 2019-03-03 19:30:18.324000+00:00 +[SchemaField('chapter', 'INTEGER', 'REQUIRED', None, ()), SchemaField('title', 'STRING', 'REQUIRED', None, ())]import pandas as pd +data = [ + (1, u'What is BigQuery?'), + (2, u'Query essentials'), +] +df = pd.DataFrame(data, columns=['chapter', 'title']) + + +table_id = '{}.ch05.temp_table3'.format(PROJECT) +job = bq.load_table_from_dataframe(df, table_id) +job.result() # blocks and waits +print("Loaded {} rows into {}".format(job.output_rows, + tblref.table_id)) +from google.cloud.bigquery.job \ + import LoadJobConfig, WriteDisposition, CreateDisposition +load_config = LoadJobConfig( + create_disposition=CreateDisposition.CREATE_IF_NEEDED, + write_disposition=WriteDisposition.WRITE_TRUNCATE) +job = bq.load_table_from_dataframe(df, table_id, + job_config=load_config) +CreateDispositionWriteDispositionCreateDisposition +WriteDisposition +CREATE_NEVER +WRITE_APPEND + +WRITE_EMPTY + +WRITE_TRUNCATE +CREATE_IF_NEEDED +WRITE_APPEND +job_config +WRITE_EMPTY + +WRITE_TRUNCATE +CreateDispositionWriteDisposition job_config = bigquery.LoadJobConfig() +job_config.autodetect = True +job_config.source_format = bigquery.SourceFormat.CSV +job_config.null_marker = 'NULL' +uri = "gs://bigquery-oreilly-book/college_scorecard.csv" +table_id = '{}.ch05.college_scorecard_gcs'.format(PROJECT) +job = bq.load_table_from_uri(uri, table_id, job_config=job_config) 
+while not job.done(): + print('.', end='', flush=True) + time.sleep(0.1) +print('Done') +table = bq.get_table(tblref) +print("Loaded {} rows into {}.".format(table.num_rows, table.table_id)) +load_table_from_filewith gzip.open('../04_load/college_scorecard.csv.gz') as fp: + job = bq.load_table_from_file(fp, tblref, job_config=job_config) +source_tbl = 'bigquery-public-data.london_bicycles.cycle_stations' +dest_tbl = '{}.ch05eu.cycle_stations_copy'.format(PROJECT) +job = bq.copy_table(source_tbl, dest_tbl, location='EU') +job.result() # blocks and waits +dest_table = bq.get_table(dest_tbl) +print(dest_table.num_rows) +extract_tablesource_tbl = 'bigquery-public-data.london_bicycles.cycle_stations' +dest_uri = 'gs://{}/tmp/exported/cycle_stations'.format(BUCKET) +config = bigquery.job.ExtractJobConfig( + destination_format = + bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON) +job = bq.extract_table(source_tbl, dest_uri, + location='EU', job_config=config) +job.result() # blocks and waits +tabledata.listcycle_stations table_id = 'bigquery-public-data.london_bicycles.cycle_stations' +table = bq.get_table(table_id) +rows = bq.list_rows(table, + start_index=0, + max_results=5) + +start_indexmax_resultsrows = bq.list_rows(table) + +page_size = 10000 +row_iter = bq.list_rows(table, + page_size=page_size) +for page in row_iter.pages: + rows = list(page) + # do something with rows ... 
+ print(len(rows)) +idcountfields = [field for field in table.schema + if 'count' in field.name or field.name == 'id'] +rows = bq.list_rows(table, + start_index=300, + max_results=5, + selected_fields=fields) +fmt = '{!s:<10} ' * len(rows.schema) +print(fmt.format(*[field.name for field in rows.schema])) +for row in rows: + print(fmt.format(*row)) +id bikes_count docks_count +658 20 30 +797 20 30 +238 21 32 +578 22 32 +477 26 36 query = """ +SELECT + start_station_name + , AVG(duration) as duration + , COUNT(duration) as num_trips +FROM `bigquery-public-data`.london_bicycles.cycle_hire +GROUP BY start_station_name +ORDER BY num_trips DESC +LIMIT 10 +""" +config = bigquery.QueryJobConfig() +config.dry_run = True +job = bq.query(query, location='EU', job_config=config) +print("This query will process {} bytes." + .format(job.total_bytes_processed)) +This query will process 903989528 bytes. +forjob = bq.query(query, location='EU') +fmt = '{!s:<40} {:>10d} {:>10d}' +for row in job: + fields = (row['start_station_name'], + (int)(0.5 + row['duration']), + row['num_trips']) + print(fmt.format(*fields)) +SELECTnum_tripsBelgrove Street , King's Cross 1011 234458 +Hyde Park Corner, Hyde Park 2783 215629 +Waterloo Station 3, Waterloo 866 201630 +Black Lion Gate, Kensington Gardens 3588 161952 +Albert Gate, Hyde Park 2359 155647 +Waterloo Station 1, Waterloo 992 145910 +Wormwood Street, Liverpool Street 976 119447 +Hop Exchange, The Borough 1218 115135 +Wellington Arch, Hyde Park 2276 110260 +Triangle Car Park, Hyde Park 2233 108347 +query = """ +SELECT + start_station_name + , AVG(duration) as duration + , COUNT(duration) as num_trips +FROM `bigquery-public-data`.london_bicycles.cycle_hire +GROUP BY start_station_name +""" +df = bq.query(query, location='EU').to_dataframe() +print(df.describe()) +describe duration num_trips +count 880.000000 880.000000 +mean 1348.351153 27692.273864 +std 434.057829 23733.621289 +min 0.000000 1.000000 +25% 1078.684974 13033.500000 +50% 
1255.889223 23658.500000 +75% 1520.504055 35450.500000 +max 4836.380090 234458.000000 + +min_durationquery2 = """ +SELECT + start_station_name + , COUNT(duration) as num_trips +FROM `bigquery-public-data`.london_bicycles.cycle_hire +WHERE duration >= @min_duration +GROUP BY start_station_name +ORDER BY num_trips DESC +LIMIT 10 +""" +@min_durationquery2 = """ +SELECT + start_station_name + , COUNT(duration) as num_trips +FROM `bigquery-public-data`.london_bicycles.cycle_hire +WHERE duration >= {} +GROUP BY start_station_name +ORDER BY num_trips DESC +LIMIT 10 +""".format(min_duration) +job_configconfig = bigquery.QueryJobConfig() +config.query_parameters = [ + bigquery.ScalarQueryParameter('min_duration', "INT64", 600) +] +job = bq.query(query2, location='EU', job_config=config) + +fmt = '{!s:<40} {:>10d}' +for row in job: + fields = (row['start_station_name'], + row['num_trips']) + print(fmt.format(*fields)) +Hyde Park Corner, Hyde Park 203592 +Belgrove Street , King's Cross 168110 +Waterloo Station 3, Waterloo 148809 +Albert Gate, Hyde Park 145794 +Black Lion Gate, Kensington Gardens 137930 +Waterloo Station 1, Waterloo 106092 +Wellington Arch, Hyde Park 102770 +Triangle Car Park, Hyde Park 99368 +Wormwood Street, Liverpool Street 82483 +Palace Gate, Kensington Gardens 80342 +#!/bin/bash + +IMAGE=--image-family=tf-latest-cpu +INSTANCE_NAME=dlvm +MAIL=google-cloud-customer@gmail.com # CHANGE THIS + +echo "Launching $INSTANCE_NAME" +gcloud compute instances create ${INSTANCE_NAME} \ + --machine-type=n1-standard-2 \ + --scopes=https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/userinfo.email \ + ${IMAGE} \ + --image-project=deeplearning-platform-release \ + --boot-disk-device-name=${INSTANCE_NAME} \ + --metadata="proxy-user-mail=${MAIL}" + +echo "Looking for Jupyter URL on $INSTANCE_NAME" +while true; do + proxy=$(gcloud compute instances describe ${INSTANCE_NAME} 2> /dev/null | grep dot-datalab-vm) + if [ -z "$proxy" ] + then + echo -n 
"." + sleep 1 + else + echo "done!" + echo "$proxy" + break + fi +done +!pip install google-cloud-bigquery +%load_ext google.cloud.bigquery +%%bigquery%%bigquery --project $PROJECT +SELECT + start_station_name + , AVG(duration) as duration + , COUNT(duration) as num_trips +FROM `bigquery-public-data`.london_bicycles.cycle_hire +GROUP BY start_station_name +ORDER BY num_trips DESC +LIMIT 5 +%%bigquerydf%%bigquery df --project $PROJECT +SELECT + start_station_name + , AVG(duration) as duration + , COUNT(duration) as num_trips +FROM `bigquery-public-data`.london_bicycles.cycle_hire +GROUP BY start_station_name +ORDER BY num_trips DESC +dfdf.describe() +badtrips%%bigquery badtrips --project $PROJECT + +WITH all_bad_trips AS ( +SELECT + start_station_name + , COUNTIF(duration < 600 AND start_station_name = end_station_name) AS bad_trips + , COUNT(*) as num_trips +FROM `bigquery-public-data`.london_bicycles.cycle_hire +WHERE EXTRACT(YEAR FROM start_date) = 2015 +GROUP BY start_station_name +HAVING num_trips > 10 +) +SELECT *, bad_trips / num_trips AS fraction_bad FROM all_bad_trips +ORDER BY fraction_bad DESC +badtrips.describe() + +fraction_badbadtrips.plot.scatter('num_trips', 'fraction_bad'); + +fraction_badnum_tripsfraction_badnum_tripsseabornimport seaborn as sns +ax = sns.regplot(badtrips['num_trips'],badtrips['fraction_bad']); +ax.set_ylim(0, 0.05); +fraction_badnum_tripsfraction_badnum_tripsfraction_badnum_tripsstations_to_examine = [] +for band in range(1,5): + min_trips = badtrips['num_trips'].quantile(0.2*(band)) + max_trips = badtrips['num_trips'].quantile(0.2*(band+1)) + query = 'num_trips >= {} and num_trips < {}'.format( + min_trips, max_trips) + print(query) # band + stations = badtrips.query(query) + stations = stations.sort_values( + by=['fraction_bad'], ascending=False)[:5] + print(stations) # 5 worst + stations_to_examine.append(stations) + print() +num_trips >= 4826.4 and num_trips < 8511.8 + start_station_name bad_trips num_trips fraction_bad +6 
River Street , Clerkenwell 221 8279 0.026694 +9 Courland Grove, Wandsworth Road 105 5369 0.019557 +10 Stanley Grove, Battersea 92 4882 0.018845 +12 Southern Grove, Bow 112 6152 0.018205 +18 Richmond Way, Shepherd's Bush 126 8149 0.015462 +num_trips >= 16509.2 and num_trips < 95740.0 + start_station_name bad_trips num_trips fraction_bad +25 Queen's Gate, Kensington Gardens 396 27457 0.014423 +74 Speakers' Corner 2, Hyde Park 468 41107 0.011385 +76 Cumberland Gate, Hyde Park 303 26981 0.011230 +77 Albert Gate, Hyde Park 729 66547 0.010955 +82 Triangle Car Park, Hyde Park 454 41675 0.010894 +stations_to_examine = pd.concat(stations_to_examine) +bq = bigquery.Client(project=PROJECT) +tblref = TableReference.from_string( + '{}.ch05eu.bad_bikes'.format(PROJECT)) +job = bq.load_table_from_dataframe(stations_to_examine, tblref) +job.result() # blocks and waits +cycle_stations%%bigquery stations_to_examine --project $PROJECT +SELECT + start_station_name AS station_name + , num_trips + , fraction_bad + , latitude + , longitude +FROM ch05eu.bad_bikes AS bad +JOIN `bigquery-public-data`.london_bicycles.cycle_stations AS s +ON bad.start_station_name = s.name +foliumimport folium +map_pts = folium.Map(location=[51.5, -0.15], zoom_start=12) +for idx, row in stations_to_examine.iterrows(): + folium.Marker( location=[row['latitude'], row['longitude']], + popup=row['station_name'] ).add_to(map_pts) +install.packages("bigrquery", dependencies=TRUE) +billing <- 'cloud-training-demos' # your project name +sql <- " +SELECT + start_station_name + , AVG(duration) as duration + , COUNT(duration) as num_trips + FROM `bigquery-public-data`.london_bicycles.cycle_hire + GROUP BY start_station_name + ORDER BY num_trips DESC + LIMIT 5 +" +tbl <- bq_project_query(billing, sql) +bq_table_download(tbl, max_results=100) +grid.tbl(tbl) +bq_project_querybq_table_download!conda install rpy2 +%load_ext rpy2.ipython +%%bigquery docks --project $PROJECT +SELECT + docks_count, latitude, longitude +FROM 
`bigquery-public-data`.london_bicycles.cycle_stations +WHERE bikes_count > 0 +lmdocks%%R -i docks +mod <- lm(docks ~ latitude + longitude) +summary(mod) +durationscipyfrom scipy import stats +ag,bg,cg = stats.gamma.fit(df['duration']) + opts = beam.pipeline.PipelineOptions(flags = [], **options) + RUNNER = 'DataflowRunner' + query = """ + SELECT start_station_id, ARRAY_AGG(duration) AS duration_array + FROM `bigquery-public-data.london_bicycles.cycle_hire` + GROUP BY start_station_id + """ + + with beam.Pipeline(RUNNER, options = opts) as p: + (p + | 'read_bq' >> beam.io.Read(beam.io.BigQuerySource(query=query)) + | 'compute_fit' >> beam.Map(compute_fit) + | 'write_bq' >> beam.io.gcp.bigquery.WriteToBigQuery( + 'ch05eu.station_stats', schema='station_id:string,ag:FLOAT64,bg:FLOAT64,cg:FLOAT64') + ) +compute_fitdef compute_fit(row): + from scipy import stats + result = {} + result['station_id'] = row['start_station_id'] + durations = row['duration_array'] + ag, bg, cg = stats.gamma.fit(durations) + result['ag'] = ag + result['bg'] = bg + result['cg'] = cg + return result + +PROJECT_IDcreateBigQueryPresentationfunction createBigQueryPresentation() { + var spreadsheet = runQuery(); + Logger.log('Results spreadsheet created: %s', spreadsheet.getUrl()); + var chart = createColumnChart(spreadsheet); // UPDATED + var deck = createSlidePresentation(spreadsheet, chart); // NEW + Logger.log('Results slide deck created: %s', deck.getUrl()); // NEW +} +runQuerycreateColumnChartcreateSlidePresentationvar queryResults = BigQuery.Jobs.query(request, PROJECT_ID); +var rows = queryResults.rows; + while (queryResults.pageToken) { + queryResults = BigQuery.Jobs.getQueryResults(PROJECT_ID, jobId, { + pageToken: queryResults.pageToken + }); + rows = rows.concat(queryResults.rows); + } +bqbqbqbq mk bq mk --location=US \ + --default_table_expiration 3600 \ + --description "Chapter 5 of BigQuery Book." 
\ + ch05 +bq mk#!/bin/bash +bq_safe_mk() { + dataset=$1 + exists=$(bq ls --dataset | grep -w $dataset) + if [ -n "$exists" ]; then + echo "Not creating $dataset since it already exists" + else + echo "Creating $dataset" + bq mk $dataset + fi +} +# this is how you call the function +bq_safe_mk ch05 +ch05gcloud authbq mk --location=US \ + --default_table_expiration 3600 \ + --description "Chapter 5 of BigQuery Book." \ + projectname:ch05 +bq mkch05.rentals_last_hourrental_iddurationbq mk --table \ + --expiration 3600 \ + --description "One hour of data" \ + --label persistence:volatile \ + ch05.rentals_last_hour rental_id:STRING,duration:FLOAT +persistencevolatilebq mk --table \ + --expiration 3600 \ + --description "One hour of data" \ + --label persistence:volatile \ + ch05.rentals_last_hour schema.json +bq cp ch04.old_table ch05.new_table +bq waitbq wait --fail_on_error job_id +bq wait --fail_on_error job_id 600job_idbq loadbq insertbq insert ch05.rentals_last_hour data.json +{"rental_id":"345ce4", "duration":240} +bq extractbq extract --format=json ch05.bad_bikes gs://bad_bikes.jsonbq querybq query \ + --use_legacy_sql=false \ + 'SELECT MAX(duration) FROM `bigquery-public-data`.london_bicycles.cycle_hire' +echo "SELECT MAX(duration) FROM `bigquery-public-data`.london_bicycles.cycle_hire" \ +| bq query --use_legacy_sql=false +#!/bin/bash +read -d '' QUERY_TEXT << EOF +SELECT + start_station_name + , AVG(duration) as duration + , COUNT(duration) as num_trips +FROM \`bigquery-public-data\`.london_bicycles.cycle_hire +GROUP BY start_station_name +ORDER BY num_trips DESC +LIMIT 5 +EOF +bq query --project_id=some_project --use_legacy_sql=false $QUERY_TEXT +bq query--use_legacy_sql=falsebq.bigqueryrc + --location$BIGQUERYRC/.bigqueryrc$HOME/.bigqueryrc$BIGQUERYRC.bigqueryrc--location=EU +--project_id=some_project +[mk] +--expiration=3600 +[query] +--use_legacy_sql=false +--location=EU some_projectbq mk--expiration=3600bq query--use_legacy_sql=false--expirationbq 
headSELECT *LIMITbq head -n 10 ch05.bad_bikes + +bq head -s 10 -n 10 ch05.bad_bikes +bq mkrental_durationch05#!/bin/bash +read -d '' QUERY_TEXT << EOF +SELECT + start_station_name + , duration/60 AS duration_minutes +FROM \`bigquery-public-data\`.london_bicycles.cycle_hire +EOF +bq mk --view=$QUERY_TEXT ch05.rental_duration +--view--materialized_viewbq ls --datasetbq ls ch05 +ch05 +bq ls -p +bq ls -j some_project +bq ls --dataset +bq ls --dataset some_project +bq ls --models +bq ls --transfer_run \ + --filter='states:PENDING' \ + --run_attempt='LATEST' \ + projects/p/locations/l/transferConfigs/c +bq ls --reservation_grant \ + --project_id=some_proj \ + --location='us' +bq showbq show ch05 +ch05 +bq show -j some_job_id +bq show --schema ch05.bad_bikes +ch05.bad_bikesbq show --view ch05.some_view +bq show --materialized_view ch05.some_view +bq show --model ch05.some_model +bq show --transfer_run \ + projects/p/locations/l/transferConfigs/c/runs/r +bq updatebq update --description "Bikes that need repair" ch05.bad_bikes +bqupdatebq update \ + --view "SELECT ..."\ + ch05.rental_durationbq update --reservation --location=US \ + --project_id=some_project \ + --reservation_size=2000000000 +bq diff --git a/06_arch/all_code.txt b/06_arch/all_code.txt new file mode 100644 index 0000000..a723f3e --- /dev/null +++ b/06_arch/all_code.txt @@ -0,0 +1,90 @@ +SELECT 17curlPOST /bigquery/v2/projects/bigquery-e2e/jobs HTTP/1.1 +User-Agent: curl/7.30.0 +Host: www.googleapis.com +Accept: */* +Authorization: Bearer +Content-Type: application/json +Content-Length: 126 +{'configuration': {'query': {'query': 'SELECT 17'}}} +POSTSELECT 17bq.pySELECTSELECT + COUNT(*) + , start_station_name +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +GROUP BY 2 +ORDER BY 1 DESC +LIMIT 10 + +SELECT + COUNT(*) + , starts.start_station_id as point_a + , ends.start_station_id as point_b +FROM + `bigquery-public-data`.london_bicycles.cycle_hire starts, + 
`bigquery-public-data`.london_bicycles.cycle_hire ends +WHERE + starts.start_station_id = ends.end_station_id + AND ends.start_station_id = starts.end_station_id + AND starts.start_station_id <> ends.start_station_id + AND starts.start_date = ends.start_date +GROUP BY 2, 3 +ORDER BY 1 DESC +LIMIT 10 + +SELECT COUNT(*) as c +FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2017` +WHERE passenger_count > 5 +bqbq --format=prettyjson show -j \ + | grep completedParallelInputs + + + "completedParallelInputs": "9", + "completedParallelInputs": "1", +COUNT_STAR() SELECT title, COUNT(title) as c +FROM `bigquery-samples.wikipedia_benchmark.Wiki1B` +WHERE title LIKE "G%o%o%g%l%e" +GROUP BY title +ORDER BY c DESC +SELECT title, COUNT(title) as c +FROM `bigquery-samples.wikipedia_benchmark.Wiki1B` +GROUP BY title +ORDER BY c DESC +WITH +repo_commits AS ( + SELECT repos AS repo_name, author.name AS author + FROM `bigquery-public-data.github_repos.commits` c, c.repo_name repos + WHERE author.name IN ("Valliappa Lakshmanan", "Jordan Tigani") + GROUP BY repos, author), +repo_languages AS ( + SELECT lang.name AS lang, lang.bytes AS lang_bytes, repos.repo_name AS repo_name + FROM `bigquery-public-data.github_repos.languages` repos, repos.LANGUAGE AS lang ) + +SELECT lang, author, SUM(lang_bytes) AS total_bytes +FROM repo_languages +JOIN repo_commits USING (repo_name) +GROUP BY lang, author +ORDER BY total_bytes DESC +SELECT lang, author, SUM(lang_bytes) AS total_bytes +FROM repo_languages +JOIN repo_commits USING (repo_name) +GROUP BY lang, author +ORDER BY total_bytes DESC +INNER JOIN EACH WITH ALLALLWITH +repo_commits AS ( + SELECT repos AS repo_name, author.name AS author + FROM `bigquery-public-data.github_repos.commits` c, c.repo_name repos + -- WHERE author.name IN ("Valliappa Lakshmanan", "Jordan Tigani") + GROUP BY repos, author), +repo_languages AS ( + SELECT lang.name AS lang, lang.bytes AS lang_bytes, repos.repo_name AS repo_name + FROM 
`bigquery-public-data.github_repos.languages` repos, repos.LANGUAGE AS lang ) + +SELECT lang, author, SUM(lang_bytes) AS total_bytes +FROM repo_languages +JOIN repo_commits USING (repo_name) +GROUP BY lang, author +ORDER BY total_bytes DESC +LIMIT 100 +INNER JOIN EACH WITH EACHEACH WITH ALLHASHPENDINGCOMMITTEDGARBAGEPENDINGCOMMITTEDGARBAGEGARBAGECOMMITTEDtables.delete()WHERE eventDate >= '20170102'2017010220170103customer_id customer_idcustomer_idSELECT … WHERE customer_id = 275customer_idcustomer_idcustomer_idSELECT orders.order_id FROM retail.orders AS orders JOIN retail.customers ON orders.customer_id = customers.customer_id +WHERE customers.customer_name = 'Jordan Tigani' +customer_idcustomer_idcustomer_idorder_idINSERTUPDATEDELETEMERGEINSERTINSERTDELETEDELETE … WHERE customer_id = 1234GARBAGEUPDATEINSERTDELETEMERGEUPDATEUPDATEMERGEDELETEUPDATEMERGEWHERE field1 = 30 diff --git a/07_perf/all_code.txt b/07_perf/all_code.txt new file mode 100644 index 0000000..c694b73 --- /dev/null +++ b/07_perf/all_code.txt @@ -0,0 +1,877 @@ +bq--dry_run--maximum_bytes_billedINFORMATION_SCHEMASELECT + job_id + , query + , user_email + , total_bytes_processed + , total_slot_ms +FROM `some-project`.INFORMATION_SCHEMA.JOBS_BY_PROJECT +WHERE EXTRACT(YEAR FROM creation_time) = 2019 +ORDER BY total_bytes_processed DESC +LIMIT 5 +total_bytes_processedtotal_slot_mstimecurlbashread -d '' QUERY_TEXT << EOF +SELECT + start_station_name + , AVG(duration) as duration + , COUNT(duration) as num_trips +FROM \`bigquery-public-data\`.london_bicycles.cycle_hire +GROUP BY start_station_name +ORDER BY num_trips DESC +LIMIT 5 +EOF + +read -d '' request << EOF +{ + "useLegacySql": false, + "useQueryCache": false, + "query": \"${QUERY_TEXT}\" +} +EOF +request=$(echo "$request" | tr '\n' ' ') + +gcloudaccess_token=$(gcloud auth application-default print-access-token) +PROJECT=$(gcloud config get-value project) + +NUM_TIMES=10 +time for i in $(seq 1 $NUM_TIMES); do +echo -en "\r ... 
$i / $NUM_NUMTIMES ..." +curl --silent \ + -H "Authorization: Bearer $access_token" \ + -H "Content-Type: application/json" \ + -X POST \ + -d "$request" \ + "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/queries" > /dev/null +done +real 0m16.875s +user 0m0.265s +sys 0m0.109s +read -d '' request << EOF +{ + "useLegacySql": false, + "useQueryCache": true, + "query": \"${QUERY_TEXT}\" +} +EOF +real 0m6.760s +user 0m0.264s +sys 0m0.114s +sudo apt-get -y install gradle +brew install gradle +git clone https://github.com/GoogleCloudPlatform/pontem.git +cd pontem/BigQueryWorkloadTester +gradle clean :BigQueryWorkloadTester:build +cat < queries/busystations.sql +SELECT + start_station_name + , AVG(duration) as duration + , COUNT(duration) as num_trips +FROM \`bigquery-public-data\`.london_bicycles.cycle_hire +GROUP BY start_station_name +ORDER BY num_trips DESC +LIMIT 5 +EOF +cat <./config.yaml +concurrencyLevel: 1 +isRatioBasedBenchmark: true +benchmarkRatios: [1.0, 2.0] +outputFileFolder: $OUTDIR +workloads: +- name: "Busy stations" + projectId: $PROJECT + queryFiles: + - queries/busystations.sql + outputFileName: busystations.json +EOF +concurrencyLevel: 10 +isRatioBasedBenchmark: true +benchmarkRatios: [0.1, 0.25, 0.5, 1.0, 1.5, 2.0]gradle clean :BigQueryWorkloadTester:run +ch05Warningch05euch05eucycle_stations_copych05eubad_bikesbad_bikescycle_stations_copycycle_stations_copybad_bikesbad_bikesSELECT + start_station_name, + AVG(duration) AS duration, + COUNT(duration) AS num_trips +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +GROUP BY + start_station_name +ORDER BY + num_trips DESC +LIMIT + 5 +JOBID=8adbf3fd-e310-44bb-9c6e-88254958ccac # CHANGE +access_token=$(gcloud auth application-default print-access-token) +PROJECT=$(gcloud config get-value project) +curl --silent \ + -H "Authorization: Bearer $access_token" \ + -X GET \ + "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/jobs/$JOBID" +"waitRatioAvg": 0.058558558558558557, 
+"readRatioAvg": 0.070270270270270274, +"computeRatioAvg": 0.86036036036036034 +... +"shuffleOutputBytes": "356596", +"shuffleOutputBytesSpilled": "0", +"recordsRead": "24369201", +"recordsWritten": "6138", +"parallelInputs": "7", +durationstart_station_time, $1$2start_station_timedurationSELECTSELECT * bike_idSELECT + bike_id + , duration +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +ORDER BY duration DESC +LIMIT 1 +SELECT + * +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +ORDER BY duration DESC +LIMIT 1 +LIMITSELECT * LIMITSELECT * SELECT * EXCEPTSELECT + MIN(start_station_name) AS start_station_name + , MIN(end_station_name) AS end_station_name + , APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration + , COUNT(duration) AS num_trips +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +WHERE + start_station_id != end_station_id +GROUP BY + start_station_id, end_station_id +ORDER BY num_trips DESC +LIMIT 10 +SELECT + start_station_name + , end_station_name + , APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration + , COUNT(duration) AS num_trips +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +WHERE + start_station_name != end_station_name +GROUP BY + start_station_name, end_station_name +ORDER BY num_trips DESC +LIMIT 10 +WITH trip_distance AS ( + SELECT + bike_id + , ST_Distance(ST_GeogPoint(s.longitude, s.latitude), + ST_GeogPoint(e.longitude, e.latitude)) AS distance + FROM + `bigquery-public-data`.london_bicycles.cycle_hire, + `bigquery-public-data`.london_bicycles.cycle_stations s, + `bigquery-public-data`.london_bicycles.cycle_stations e + WHERE + start_station_id = s.id + AND end_station_id = e.id +) + +SELECT + bike_id + , SUM(distance)/1000 AS total_distance +FROM trip_distance +GROUP BY bike_id +ORDER BY total_distance DESC +LIMIT 5 +WITH stations AS ( + SELECT + s.id AS start_id + , e.id AS end_id + , ST_Distance(ST_GeogPoint(s.longitude, s.latitude), + ST_GeogPoint(e.longitude, e.latitude)) AS 
distance + FROM + `bigquery-public-data`.london_bicycles.cycle_stations s, + `bigquery-public-data`.london_bicycles.cycle_stations e +), + +trip_distance AS ( + SELECT + bike_id + , distance + FROM + `bigquery-public-data`.london_bicycles.cycle_hire, + stations + WHERE + start_station_id = start_id + AND end_station_id = end_id +) + +SELECT + bike_id + , SUM(distance)/1000 AS total_distance +FROM trip_distance +GROUP BY bike_id +ORDER BY total_distance DESC +LIMIT 5 +CURRENT_TIMESTAMPRANDWITH typical_trip AS ( +SELECT + start_station_name + , end_station_name + , APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration + , COUNT(duration) AS num_trips +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +GROUP BY + start_station_name, end_station_name +) +CREATE OR REPLACE TABLE ch07eu.typical_trip AS +SELECT + start_station_name + , end_station_name + , APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration + , COUNT(duration) AS num_trips +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +GROUP BY + start_station_name, end_station_name +SELECT + EXTRACT (DATE FROM start_date) AS trip_date + , APPROX_QUANTILES(duration / typical_duration, 10)[OFFSET(5)] AS ratio + , COUNT(*) AS num_trips_on_day +FROM + `bigquery-public-data`.london_bicycles.cycle_hire AS hire +JOIN typical_trip AS trip +ON + hire.start_station_name = trip.start_station_name + AND hire.end_station_name = trip.end_station_name + AND num_trips > 10 +GROUP BY trip_date +HAVING num_trips_on_day > 10 +ORDER BY ratio DESC +LIMIT 10 +SELECT + EXTRACT (DATE FROM start_date) AS trip_date + , APPROX_QUANTILES(duration / typical_duration, 10)[OFFSET(5)] AS ratio + , COUNT(*) AS num_trips_on_day +FROM + `bigquery-public-data`.london_bicycles.cycle_hire AS hire +JOIN ch07eu.typical_trip AS trip +ON + hire.start_station_name = trip.start_station_name + AND hire.end_station_name = trip.end_station_name + AND num_trips > 10 +GROUP BY trip_date +HAVING num_trips_on_day > 10 +ORDER BY ratio 
DESC +LIMIT 10 +ch07eu.typical_tripcycle_hireWITH trip AS ( +SELECT + REGEXP_REPLACE(start_station_name, + r"^# ([a-zA-Z0-9\s]+$)", "FROM: \\1") AS start_station_name + , REGEXP_REPLACE(end_station_name, + r"^# ([a-zA-Z0-9\s]+$)", "TO: \\1") AS end_station_name /* explicit alias keeps the original column name */ + , duration +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +) +SELECT * FROM trip +WHERE duration > 84000 +tripCREATE OR REPLACE TABLE ch07eu.london_bicycles_denorm AS +SELECT + start_station_id + , s.latitude AS start_latitude + , s.longitude AS start_longitude + , end_station_id + , e.latitude AS end_latitude + , e.longitude AS end_longitude +FROM + `bigquery-public-data`.london_bicycles.cycle_hire as h +JOIN + `bigquery-public-data`.london_bicycles.cycle_stations as s +ON + h.start_station_id = s.id +JOIN + `bigquery-public-data`.london_bicycles.cycle_stations as e +ON + h.end_station_id = e.id + +SELECT + name + , number AS num_babies +FROM `bigquery-public-data`.usa_names.usa_1910_current +WHERE gender = 'M' AND year = 2015 AND state = 'MA' +ORDER BY num_babies DESC +LIMIT 5 +WITH male_babies AS ( +SELECT + name + , number AS num_babies +FROM `bigquery-public-data`.usa_names.usa_1910_current +WHERE gender = 'M' +), +female_babies AS ( +SELECT + name + , number AS num_babies +FROM `bigquery-public-data`.usa_names.usa_1910_current +WHERE gender = 'F' +), +both_genders AS ( +SELECT + name + , SUM(m.num_babies) + SUM(f.num_babies) AS num_babies + , SUM(m.num_babies) / (SUM(m.num_babies) + SUM(f.num_babies)) AS frac_male +FROM male_babies AS m +JOIN female_babies AS f +USING (name) +GROUP BY name +) + +SELECT * FROM both_genders +WHERE frac_male BETWEEN 0.3 and 0.7 +ORDER BY num_babies DESC +LIMIT 5 +WITH all_babies AS ( +SELECT + name + , SUM(IF(gender = 'M', number, 0)) AS male_babies + , SUM(IF(gender = 'F', number, 0)) AS female_babies +FROM `bigquery-public-data.usa_names.usa_1910_current` +GROUP BY name +), + +both_genders AS ( +SELECT + name + , (male_babies + female_babies) AS num_babies +
, SAFE_DIVIDE(male_babies, male_babies + female_babies) AS frac_male +FROM all_babies +WHERE male_babies > 0 AND female_babies > 0 +) + +SELECT * FROM both_genders +WHERE frac_male BETWEEN 0.3 and 0.7 +ORDER BY num_babies desc +limit 5 +with all_names AS ( + SELECT name, gender, SUM(number) AS num_babies + FROM `bigquery-public-data`.usa_names.usa_1910_current + GROUP BY name, gender +), + +male_names AS ( + SELECT name, num_babies + FROM all_names + WHERE gender = 'M' +), + +female_names AS ( + SELECT name, num_babies + FROM all_names + WHERE gender = 'F' +), + +ratio AS ( + SELECT + name + , (f.num_babies + m.num_babies) AS num_babies + , m.num_babies / (f.num_babies + m.num_babies) AS frac_male + FROM male_names AS m + JOIN female_names AS f + USING (name) +) + +SELECT * from ratio +WHERE frac_male BETWEEN 0.3 and 0.7 +ORDER BY num_babies DESC +LIMIT 5 +end_datestart_dateSELECT + bike_id + , start_date + , end_date + , TIMESTAMP_DIFF( + start_date, + LAG(end_date) OVER (PARTITION BY bike_id ORDER BY start_date), + SECOND) AS time_at_station +FROM `bigquery-public-data`.london_bicycles.cycle_hire +LIMIT 5 +time_at_stationtime_at_stationWITH unused AS ( +SELECT + bike_id + , start_station_name + , start_date + , end_date + , TIMESTAMP_DIFF(start_date, LAG(end_date) OVER (PARTITION BY bike_id ORDER BY start_date), SECOND) AS time_at_station +FROM `bigquery-public-data`.london_bicycles.cycle_hire +) + +SELECT + start_station_name + , AVG(time_at_station) AS unused_seconds +FROM unused +GROUP BY start_station_name +ORDER BY unused_seconds ASC +LIMIT 5 +with denormalized_table AS ( + SELECT + start_station_name + , end_station_name + , ST_DISTANCE(ST_GeogPoint(s1.longitude, s1.latitude), + ST_GeogPoint(s2.longitude, s2.latitude)) AS distance + , duration + FROM + `bigquery-public-data`.london_bicycles.cycle_hire AS h + JOIN + `bigquery-public-data`.london_bicycles.cycle_stations AS s1 + ON h.start_station_id = s1.id + JOIN + 
`bigquery-public-data`.london_bicycles.cycle_stations AS s2 + ON h.end_station_id = s2.id +), + +durations AS ( + SELECT + start_station_name + , end_station_name + , MIN(distance) AS distance + , AVG(duration) AS duration + , COUNT(*) AS num_rides + FROM + denormalized_table + WHERE + duration > 0 AND distance > 0 + GROUP BY start_station_name, end_station_name + HAVING num_rides > 100 +) + +SELECT + start_station_name + , end_station_name + , distance + , duration + , duration/distance AS pace +FROM durations +ORDER BY pace ASC +LIMIT 5 +ST_DISTANCEcycle_hirecycle_stationswith distances AS ( + SELECT + a.id AS start_station_id + , a.name AS start_station_name + , b.id AS end_station_id + , b.name AS end_station_name + , ST_DISTANCE(ST_GeogPoint(a.longitude, a.latitude), + ST_GeogPoint(b.longitude, b.latitude)) AS distance + FROM + `bigquery-public-data`.london_bicycles.cycle_stations a + CROSS JOIN + `bigquery-public-data`.london_bicycles.cycle_stations b + WHERE a.id != b.id +), + +durations AS ( + SELECT + start_station_id + , end_station_id + , AVG(duration) AS duration + , COUNT(*) AS num_rides + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + WHERE + duration > 0 + GROUP BY start_station_id, end_station_id + HAVING num_rides > 100 +) + +SELECT + start_station_name + , end_station_name + , distance + , duration + , duration/distance AS pace +FROM distances +JOIN durations +USING (start_station_id, end_station_id) +ORDER BY pace ASC +LIMIT 5 + CREATE OR REPLACE TABLE ch07eu.cycle_hire AS + SELECT + start_station_name + , end_station_name + , ST_DISTANCE(ST_GeogPoint(s1.longitude, s1.latitude), + ST_GeogPoint(s2.longitude, s2.latitude)) AS distance + , duration + FROM + `bigquery-public-data`.london_bicycles.cycle_hire AS h + JOIN + `bigquery-public-data`.london_bicycles.cycle_stations AS s1 + ON h.start_station_id = s1.id + JOIN + `bigquery-public-data`.london_bicycles.cycle_stations AS s2 + ON h.end_station_id = s2.id +SELECT + rental_id + , 
ROW_NUMBER() OVER(ORDER BY end_date) AS rental_number +FROM `bigquery-public-data.london_bicycles.cycle_hire` +ORDER BY rental_number ASC +LIMIT 5 +WITH rentals_on_day AS ( +SELECT + rental_id + , end_date + , EXTRACT(DATE FROM end_date) AS rental_date +FROM `bigquery-public-data.london_bicycles.cycle_hire` +) + +SELECT + rental_id + , rental_date + , ROW_NUMBER() OVER(PARTITION BY rental_date ORDER BY end_date) AS rental_number_on_day +FROM rentals_on_day +ORDER BY rental_date ASC, rental_number_on_day ASC +LIMIT 5 + +ARRAY_AGGGROUP BYSELECT + repo_name + , ARRAY_AGG(STRUCT(author, committer, subject, message, trailer, difference, encoding) ORDER BY author.date.seconds) +FROM `bigquery-public-data.github_repos.commits`, UNNEST(repo_name) AS repo_name +GROUP BY repo_name +SELECT + author.tz_offset, ARRAY_AGG(STRUCT(author, committer, subject, message, trailer, difference, encoding) ORDER BY author.date.seconds) +FROM `bigquery-public-data.github_repos.commits` +GROUP BY author.tz_offset +SELECT + author.tz_offset, ARRAY_AGG(STRUCT(author, committer, subject, message, trailer, difference, encoding) ORDER BY author.date.seconds LIMIT 1000) +FROM `bigquery-public-data.github_repos.commits` +GROUP BY author.tz_offset +SELECT + repo_name, author.tz_offset + , ARRAY_AGG(STRUCT(author, committer, subject, message, trailer, difference, encoding) ORDER BY author.date.seconds) +FROM `bigquery-public-data.github_repos.commits`, UNNEST(repo_name) AS repo_name +GROUP BY repo_name, author.tz_offsetSELECT + COUNT(DISTINCT repo_name) AS num_repos +FROM `bigquery-public-data`.github_repos.commits, UNNEST(repo_name) AS repo_name +SELECT + APPROX_COUNT_DISTINCT(repo_name) AS num_repos +FROM `bigquery-public-data`.github_repos.commits, UNNEST(repo_name) AS repo_name +SELECT + COUNT(DISTINCT bike_id) AS num_bikes +FROM `bigquery-public-data`.london_bicycles.cycle_hire +SELECT + APPROX_COUNT_DISTINCT(bike_id) AS num_bikes +FROM `bigquery-public-data`.london_bicycles.cycle_hire 
+APPROX_QUANTILESAPPROX_TOP_COUNTAPPROX_TOP_SUMAPPROX_TOP_COUNTSELECT + APPROX_TOP_COUNT(bike_id, 5) AS num_bikes +FROM `bigquery-public-data`.london_bicycles.cycle_hire +APPROX_TOP_COUNT LIMITAPPROX_TOP_SUMSELECT + APPROX_TOP_SUM(start_station_name, duration, 5) AS num_bikes +FROM `bigquery-public-data`.london_bicycles.cycle_hire +WHERE duration > 0 +APPROX_*HLL_COUNT.INIT +HLL_COUNT.EXTRACT +HLL_COUNT.MERGE_PARTIAL +HLL_COUNT.MERGEWITH sketch AS ( +SELECT + HLL_COUNT.INIT(start_station_name) AS hll_start + , HLL_COUNT.INIT(end_station_name) AS hll_end +FROM `bigquery-public-data`.london_bicycles.cycle_hire +) + +SELECT + HLL_COUNT.MERGE(hll_start) AS distinct_start + , HLL_COUNT.MERGE(hll_end) AS distinct_end + , HLL_COUNT.MERGE(hll_both) AS distinct_station +FROM sketch, UNNEST([hll_start, hll_end]) AS hll_both +APPROX_COUNT_DISTINCTSELECT + APPROX_COUNT_DISTINCT(start_station_name) AS distinct_start + , APPROX_COUNT_DISTINCT(end_station_name) AS distinct_end + , APPROX_COUNT_DISTINCT(both_stations) AS distinct_station +FROM + `bigquery-public-data`.london_bicycles.cycle_hire + , UNNEST([start_station_name, end_station_name]) AS both_stations +APPROX_user_id, date, product, country +user_idHLL_COUNT.INITINSERT INTO approx_distinct_users_agg (date, product, country, sketch) /* one HLL sketch per (date, product, country) */ +SELECT date, product, country, HLL_COUNT.INIT(user_id) AS sketch FROM [TABLENAME] /* placeholder: source table holding user_id, date, product, country */ +GROUP BY date, product, country /* group by the keys only; sketch is the aggregate */ +SELECT date, HLL_COUNT.MERGE(sketch) +FROM approx_distinct_users_agg +GROUP BY dateAccept-Encoding: gzip +User-Agent: programName (gzip) +JOBSURL="https://www.googleapis.com/bigquery/v2/projects/$PROJECT/jobs" +FIELDS="statistics(query(queryPlan(steps)))" +curl --silent \ + -H "Authorization: Bearer $access_token" \ + -H "Accept-Encoding: gzip" \ + -H "User-Agent: get_job_details (gzip)" \ + -X GET \ + "${JOBSURL}/${JOBID}?fields=${FIELDS}" \ + | zcat +multipart/mixedmultipart/mixed# The 5 most recent successful jobs +JOBS=$(bq ls -j -n 50 | grep SUCCESS | head -5 | awk '{print $1}') +
+BATCHURL="https://www.googleapis.com/batch/bigquery/v2" +JOBSPATH="/projects/$PROJECT/jobs" +FIELDS="statistics(query(queryPlan(steps)))" + +request="" +for JOBID in $JOBS; do +read -d '' part << EOF + +--batch_part_starts_here +GET ${JOBSPATH}/${JOBID}?fields=${FIELDS} + +EOF +request=$(echo "$request"; echo "$part") +done + +curl --silent \ + -H "Authorization: Bearer $access_token" \ + -H "Content-Type: multipart/mixed; boundary=batch_part_starts_here" \ + -X POST \ + -d "$request" \ + "${BATCHURL}" +%%bigquery%pip install google-cloud-bigquery-storage[fastavro,pandas] +%%bigquery%%bigquery df --use_bqstorage_api --project $PROJECT +SELECT + start_station_name + , end_station_name + , start_date + , duration +FROM `bigquery-public-data`.london_bicycles.cycle_hire +--use_bqstorage_apiimport google.cloud.bigquery.magics +google.cloud.bigquery.magics.context.use_bqstorage_api = Truegsutil lifecycle set lifecycle.yaml gs://some_bucket/ +{ +"lifecycle": { + "rule": [ + { + "action": { + "type": "SetStorageClass", + "storageClass": "NEARLINE" + }, + "condition": { + "age": 30, + "matchesStorageClass": ["MULTI_REGIONAL", "STANDARD"] + } + }, + { + "action": { + "type": "SetStorageClass", + "storageClass": "COLDLINE" + }, + "condition": { + "age": 90, + "matchesStorageClass": ["NEARLINE"] + } + } +]}} +SELECT + sid, number, basin, name, + ARRAY_AGG(STRUCT(iso_time, usa_latitude, usa_longitude, usa_wind) ORDER BY usa_wind DESC LIMIT 1)[OFFSET(0)].* +FROM + `bigquery-public-data`.noaa_hurricanes.hurricanes +WHERE + season = '2018' +GROUP BY + sid, number, basin, name +ORDER BY number ASC +CREATE OR REPLACE TABLE ch07.hurricanes_nested AS + +SELECT sid, season, number, basin, name, iso_time, nature, usa_sshs, + STRUCT(usa_latitude AS latitude, usa_longitude AS longitude, usa_wind AS wind, usa_pressure AS pressure) AS usa, + STRUCT(tokyo_latitude AS latitude, tokyo_longitude AS longitude, tokyo_wind AS wind, tokyo_pressure AS pressure) AS tokyo, + ... AS cma, + ... 
AS hko, + ... AS newdelhi, + ... AS reunion, + ... bom, + ... AS wellington, + ... nadi +FROM `bigquery-public-data`.noaa_hurricanes.hurricanes +usa.latitudeusa_latitudeSELECT + sid, number, basin, name, + ARRAY_AGG(STRUCT(iso_time, usa.latitude, usa.longitude, usa.wind) ORDER BY usa.wind DESC LIMIT 1)[OFFSET(0)].* +FROM + ch07.hurricanes_nested +WHERE + season = '2018' +GROUP BY + sid, number, basin, name +ORDER BY number ASC +CREATE OR REPLACE TABLE ch07.hurricanes_nested_track AS + +SELECT sid, season, number, basin, name, + ARRAY_AGG( + STRUCT( + iso_time, + nature, + usa_sshs, + STRUCT(usa_latitude AS latitude, usa_longitude AS longitude, usa_wind AS wind, usa_pressure AS pressure) AS usa, + STRUCT(tokyo_latitude AS latitude, tokyo_longitude AS longitude, tokyo_wind AS wind, tokyo_pressure AS pressure) AS tokyo, + ... AS cma, + ... AS hko, + ... AS newdelhi, + ... AS reunion, + ... bom, + ... AS wellington, + ... nadi + ) ORDER BY iso_time ASC ) AS obs +FROM `bigquery-public-data`.noaa_hurricanes.hurricanes +GROUP BY sid, season, number, basin, name +sidseasonSELECT + number, name, basin, + (SELECT AS STRUCT iso_time, usa.latitude, usa.longitude, usa.wind + FROM UNNEST(obs) ORDER BY usa.wind DESC LIMIT 1).* +FROM ch07.hurricanes_nested_track +WHERE season = '2018' +ORDER BY number ASC +WITH hurricane_detail AS ( + +SELECT sid, season, number, basin, name, + ARRAY_AGG( + STRUCT( + iso_time, + nature, + usa_sshs, + STRUCT(usa_latitude AS latitude, usa_longitude AS longitude, usa_wind AS wind, usa_pressure AS pressure) AS usa, + STRUCT(tokyo_latitude AS latitude, tokyo_longitude AS longitude, tokyo_wind AS wind, tokyo_pressure AS pressure) AS tokyo + ) ORDER BY iso_time ASC ) AS obs +FROM `bigquery-public-data`.noaa_hurricanes.hurricanes +GROUP BY sid, season, number, basin, name +) + +SELECT + COUNT(sid) AS count_of_storms, + season +FROM hurricane_detail +GROUP BY season +ORDER BY season DESC + 
+seasonbigquery-public-data.github_repos.commitsrepo_namerepo_nameSELECT DISTINCT + visitId + , totals.pageviews + , totals.timeOnsite + , trafficSource.source + , device.browser + , device.isMobile + , h.page.pageTitle +FROM + `bigquery-public-data`.google_analytics_sample.ga_sessions_20170801, + UNNEST(hits) AS h +WHERE + totals.timeOnSite IS NOT NULL AND h.page.pageTitle = 'Shopping Cart' +ORDER BY pageviews DESC +LIMIT 10 +[1,2,3,4,5] +[1, +2 +3 +4 +5] +bigquery-public-data.utility_us.zipcode_areabigquery-public-data.utility_us.us_cities_areazipcode_geomcity_geomSELECT name, zipcode +FROM `bigquery-public-data`.utility_us.zipcode_area +JOIN `bigquery-public-data`.utility_us.us_cities_area +ON ST_INTERSECTS(ST_GeogFromText(zipcode_geom), city_geom) +WHERE name LIKE '%Santa Fe%' +ST_INTERSECTSST_GeogFromTextCREATE OR REPLACE TABLE ch07.zipcode_area AS +SELECT + * REPLACE(ST_GeogFromText(zipcode_geom) AS zipcode_geom) +FROM + `bigquery-public-data`.utility_us.zipcode_area +SELECT * REPLACESELECT *SELECT name, zipcode +FROM ch07.zipcode_area +JOIN `bigquery-public-data`.utility_us.us_cities_area +ON ST_INTERSECTS(zipcode_geom, city_geom) +WHERE name LIKE '%Santa Fe%' +utility_usbigquery-public-data.geo_us_boundaries.us_zip_codesSELECT + start_station_name + , AVG(duration) AS avg_duration +FROM `bigquery-public-data`.london_bicycles.cycle_hire +WHERE EXTRACT(YEAR from start_date) = 2015 +GROUP BY start_station_name +ORDER BY avg_duration DESC +LIMIT 5 +cycle_hire_2015CREATE OR REPLACE TABLE ch07eu.cycle_hire_2015 AS ( + SELECT * FROM `bigquery-public-data`.london_bicycles.cycle_hire + WHERE EXTRACT(YEAR from start_date) = 2015 +) +SELECT + start_station_name + , AVG(duration) AS avg_duration +FROM ch07eu.cycle_hire_2015 +GROUP BY start_station_name +ORDER BY avg_duration DESC +LIMIT 5 +SELECT + start_station_name + , AVG(duration) AS avg_duration +FROM `ch07eu.cycle_hire_*` +WHERE _TABLE_SUFFIX BETWEEN '2015' AND '2016' +GROUP BY start_station_name +ORDER BY 
avg_duration DESC +LIMIT 5 +CREATE OR REPLACE TABLE ch07eu.cycle_hire_partitioned + PARTITION BY DATE(start_date) AS +SELECT * FROM `bigquery-public-data`.london_bicycles.cycle_hire +CREATE OR REPLACE TABLE ch07eu.cycle_hire_partitioned + PARTITION BY DATE(start_date) + OPTIONS(partition_expiration_days=1000, + require_partition_filter=true) AS +SELECT * FROM `bigquery-public-data`.london_bicycles.cycle_hire +ALTER TABLE ch07eu.cycle_hire_partitioned +SET OPTIONS(require_partition_filter=true) +start_dateSELECT + start_station_name + , AVG(duration) AS avg_duration +FROM ch07eu.cycle_hire_partitioned +WHERE start_date BETWEEN '2015-01-01' AND '2015-12-31' +GROUP BY start_station_name +ORDER BY avg_duration DESC +LIMIT 5SELECT + start_station_name + , AVG(duration) AS avg_duration +FROM ch07eu.cycle_hire_partitioned +WHERE EXTRACT(YEAR FROM start_date) = 2015 +GROUP BY start_station_name +ORDER BY avg_duration DESC +LIMIT 5_PARTITIONTIME_PARTITIONDATE__UNPARTITIONED____UNPARTITIONED__NULL_PARTITIONTIMEcustomerIdstart_station_nameend_station_nameCREATE OR REPLACE TABLE ch07eu.cycle_hire_clustered + PARTITION BY DATE(start_date) + CLUSTER BY start_station_name, end_station_name +AS ( + SELECT * FROM `bigquery-public-data`.london_bicycles.cycle_hire +) +SELECT + start_station_name + , end_station_name + , AVG(duration) AS duration +FROM ch07eu.cycle_hire_clustered +WHERE + start_station_name LIKE '%Kennington%' + AND end_station_name LIKE '%Hyde%' +GROUP BY start_station_name, end_station_name +CLUSTER BY wiki, title +wikititleSELECT title, SUM(views) AS views +FROM `fh-bigquery.wikipedia_v3.pageviews_2017` +WHERE DATE(datehour) BETWEEN '2017-06-01' AND '2017-06-30' +AND wiki = 'en' +AND title LIKE '%Liberia%' +GROUP BY title +event_timeclustering_ratioMERGEMERGE ch07eu.cycle_hire_clustered all_hires +USING ch07eu.cycle_hire_corrections some_month +ON all_hires.start_station_name = some_month.start_station_name +WHEN MATCHED + AND all_hires._PARTITIONTIME = 
DATE(some_month.start_date) THEN + INSERT (rental_id, duration, ...) + VALUES (rental_id, duration, ...) +UPDATEUPDATE ch07eu.cycle_hire_clustered +SET start_station_id = 300 +WHERE start_station_id = 300 +AND start_date > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)SELECT * ... LIMIT 10SELECT * ... LIMIT 10zip_codestatezip_codezip_codestatezip_code0000099999stateorderscustomer_idcustomersSELECT o.* +FROM orders o +JOIN customers c USING (customer_id) +WHERE c.name = "Changying Bao" +orderscustomer_id// First look up the customer id. +// This scans only the small dimension table +SET id = SELECT customer_id FROM customers c +WHERE c.name = "Changying Bao" +// Next look up that customer's rows in the orders table. +// This will filter by the cluster column, +// and so only needs to read a small amount of data. +SELECT * FROM orders WHERE customer_id=$id ; +--batch flagBATCHINTERACTIVEBigQueryIOBigQueryIO.writeTableRows() + .to("project-id:dataset-id.table-id") + .withCreateDisposition( + BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) + .withMethod(Method.FILE_LOADS) + .withTriggeringFrequency(Duration.standardSeconds(600)) + .withNumFileShards(10) + .withSchema(new TableSchema()...)
+ .withoutValidation()) +dry_run diff --git a/08_advqueries/all_code.txt b/08_advqueries/all_code.txt new file mode 100644 index 0000000..7e008d1 --- /dev/null +++ b/08_advqueries/all_code.txt @@ -0,0 +1,1071 @@ + + query = """ + SELECT + start_station_name + , AVG(duration) as avg_duration + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + WHERE + start_station_name LIKE CONCAT('%', @STATION, '%') + AND duration BETWEEN @MIN_DURATION AND @MAX_DURATION + GROUP BY start_station_name + """ +'%@STATION%', + + query_params = [ + bigquery.ScalarQueryParameter( + "STATION", "STRING", station_name), + bigquery.ScalarQueryParameter( + "MIN_DURATION", "FLOAT64", min_duration), + bigquery.ScalarQueryParameter( + "MAX_DURATION", "FLOAT64", max_duration), + ] + + + job_config = bigquery.QueryJobConfig() + job_config.query_parameters = query_params + query_job = client.query( + query, + location="EU", + job_config=job_config, + ) + for row in query_job: + print("{}: \t{}".format( + row.start_station_name, row.avg_duration)) + +def print_query_results(client, + station_name, + min_duration=0, + max_duration=84000): + +client = bigquery.Client() +print_query_results(client, 'Kennington', 300) +print_query_results(client, 'Hyde Park', 600, 6000) + +Kennington between 300 and 84000 +Kennington Oval, Oval: 1269.0798128928543 +Doddington Grove, Kennington: 1243.7377963737788 +Kennington Road Post Office, Oval: 1360.2854550952536 +Kennington Lane Rail Bridge, Vauxhall: 991.4344845855808 +Cleaver Street, Kennington: 1075.6050140700947 +Kennington Cross, Kennington: 996.2538654101008 +Kennington Road , Vauxhall: 1228.6673653660118 +Cotton Garden Estate, Kennington: 996.7003600110778 +Kennington Lane Tesco, Vauxhall: 929.6523615439942 +Kennington Station, Kennington: 1238.4088412072647 +______________________ +Hyde Park between 600 and 6000 +Bayswater Road, Hyde Park: 1614.2670577732417 +Wellington Arch, Hyde Park: 1828.9651324965134 +Hyde Park Corner, Hyde Park: 
2120.4145144213744 +Cumberland Gate, Hyde Park: 1899.3282223532708 +Speakers' Corner 1, Hyde Park: 2070.2458069837776 +Triangle Car Park, Hyde Park: 1815.661582196573 +Albert Gate, Hyde Park: 1897.9349474341027 +Knightsbridge, Hyde Park: 1963.0815096317635 +Serpentine Car Park, Hyde Park: 1688.0595490490423 +Park Lane , Hyde Park: 2055.451932776309 +Speakers' Corner 2, Hyde Park: 2093.6202531645563 +______________________ +client.querydatetime.datetimefrom google.cloud import bigquery +from datetime import datetime +from datetime import timedelta +import pytz + +def print_query_results(client, mid_time): + start_time = mid_time - timedelta(minutes=30) + end_time = mid_time + timedelta(minutes=30) + + query = """ + SELECT + AVG(duration) as avg_duration + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + WHERE + start_date BETWEEN @START_TIME AND @END_TIME + """ + query_params = [ + bigquery.ScalarQueryParameter( + "START_TIME", "TIMESTAMP", start_time), + bigquery.ScalarQueryParameter( + "END_TIME", "TIMESTAMP", end_time), + ] + job_config = bigquery.QueryJobConfig() + job_config.query_parameters = query_params + query_job = client.query( + query, + location="EU", + job_config=job_config, + ) + for row in query_job: + print(row.avg_duration) + print("______________________") + + +client = bigquery.Client() +print_query_results(client, + datetime(2015, 12, 25, 15, 0, tzinfo=pytz.UTC)) +3658.5000000000005 +@run_timeTIMESTAMP@run_dateDATEdef print_query_results(client, params): + query = """ + SELECT + start_station_name + , AVG(duration) as avg_duration + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + WHERE + start_station_name LIKE CONCAT('%', ?, '%') + AND duration BETWEEN ? AND ? 
+ GROUP BY start_station_name + """ + query_params = [ + bigquery.ScalarQueryParameter( + None, "STRING", params[0]), + bigquery.ScalarQueryParameter( + None, "FLOAT64", params[1]), + bigquery.ScalarQueryParameter( + None, "FLOAT64", params[2]), + ] +@STATIONSINUNNEST query = """ + SELECT + start_station_id + , COUNT(*) as num_trips + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + WHERE + start_station_id IN UNNEST(@STATIONS) + AND duration BETWEEN @MIN_DURATION AND @MAX_DURATION + GROUP BY start_station_id + """ + query_params = [ + bigquery.ArrayQueryParameter( + 'STATIONS', "INT64", ids), + bigquery.ScalarQueryParameter( + 'MIN_DURATION', "FLOAT64", min_duration), + bigquery.ScalarQueryParameter( + 'MAX_DURATION', "FLOAT64", max_duration), + ] +stationsprint_query_results(client, [270, 235, 62, 149], 300, 600) +270: 26400 +149: 4143 +235: 8337 +62: 5954 +bigquery.StructQueryParameter( + "bicycle_trip", + bigquery.ScalarQueryParameter("start_station_id", "INT64", 62), + bigquery.ScalarQueryParameter("end_station_id", "INT64", 421), + ) + +--dry_run +CREATE TEMPORARY FUNCTION dayOfWeek(x TIMESTAMP) AS +( + ['Sun','Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'] + [ORDINAL(EXTRACT(DAYOFWEEK from x))] +); +CREATE TEMPORARY FUNCTION getDate(x TIMESTAMP) AS +( + EXTRACT(DATE FROM x) +); +WITH overnight_trips AS ( + SELECT + duration + , dayOfWeek(start_date) AS start_day + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + WHERE + getDate(start_date) != getDate(end_date) +) +SELECT + start_day + , COUNT(*) AS num_overnight_rentals + , AVG(duration)/3600 AS avg_duration_hours +FROM + overnight_trips +GROUP BY + start_day +ORDER BY num_overnight_rentals DESC +ch08euCREATE OR REPLACE FUNCTION ch08eu.dayOfWeek(x TIMESTAMP) AS +( + ['Sun','Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'] + [ORDINAL(EXTRACT(DAYOFWEEK from x))] +); +CREATE FUNCTIONCREATE OR REPLACE FUNCTIONCREATE FUNCTION IF NOT EXISTSWITH overnight_trips AS ( + SELECT + duration + , 
ch08eu.dayOfWeek(start_date) AS start_day + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + ... +bigquery.routines.listbigquery.routines.[create/get/update/delete] allAuthenticatedUsersAVGMEDIANCREATE OR REPLACE FUNCTION fhoffa.x.median (arr ANY TYPE) AS (( + SELECT IF (MOD(ARRAY_LENGTH(arr), 2) = 0, + ( arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2) - 1)] + + arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))] ) / 2, + arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))] + ) + FROM (SELECT ARRAY_AGG(x ORDER BY x) AS arr FROM UNNEST(arr) AS x) +)); +xfhoffaSELECT + start_station_name + , COUNT(*) AS num_trips + , fhoffa.x.median(ARRAY_AGG(tripduration)) AS typical_duration +FROM `bigquery-public-data`.new_york_citibike.citibike_trips -- london_bicycles.cycle_hire +GROUP BY start_station_name +HAVING num_trips > 1000 +ORDER BY typical_duration DESC +LIMIT 5 + +fhoffa.x SELECT + start_date, + COUNT(*) AS num_long_trips +FROM -- "first from" + (SELECT + start_station_name + , duration + , EXTRACT(DATE from start_date) AS start_date + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + WHERE + start_station_name = end_station_name) AS roundtrips +WHERE -- "outer where" + duration > 2*( + SELECT + AVG(duration) as avg_duration + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + WHERE + start_station_name = end_station_name + AND roundtrips.start_station_name = start_station_name + ) +GROUP BY start_date +ORDER BY num_long_trips DESC +LIMIT 5 + +FROMWHEREstart_station_namedurationend_station_nameWITH roundtrips AS ( +SELECT + start_station_name + , duration + , EXTRACT(DATE from start_date) AS start_date +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +WHERE + start_station_name = end_station_name +), + +station_avg AS ( +SELECT + start_station_name + , AVG(duration) as avg_duration +FROM + roundtrips +GROUP BY start_station_name +) + +SELECT + start_date, + COUNT(*) AS num_long_trips +FROM + roundtrips +JOIN station_avg USING(start_station_name) +WHERE duration > 
2*avg_duration +GROUP BY start_date +ORDER BY num_long_trips DESC +LIMIT 5 +roundtripsstation_avgWITH params AS ( + SELECT 600 AS DURATION_THRESH +) +SELECT + start_station_name + , COUNT(duration) as num_trips +FROM + `bigquery-public-data`.london_bicycles.cycle_hire + , params +WHERE duration >= DURATION_THRESH +GROUP BY start_station_name +ORDER BY num_trips DESC +LIMIT 5 +WITHFROMDURATION_THRESHSELECT + bike_id, + COUNT(*) AS num_trips +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +GROUP BY + bike_id +ORDER BY + num_trips DESC +LIMIT + 100 +ORDER BYLIMITARRAY_AGG +WITH numtrips AS ( + SELECT + bike_id AS id, + COUNT(*) AS num_trips + FROM + `bigquery-public-data`.london_bicycles.cycle_hire + GROUP BY + bike_id +) + +SELECT + ARRAY_AGG(STRUCT(id,num_trips) + ORDER BY num_trips DESC LIMIT 100) + AS bike +FROM + numtrips +SELECT + ein + , ARRAY_AGG(STRUCT(elf, tax_pd, subseccd)) AS filing +FROM `bigquery-public-data`.irs_990.irs_990_2015 +WHERE ein BETWEEN '390' AND '399' +GROUP BY ein +LIMIT 3 +elf'E'SELECT + ein +FROM `bigquery-public-data`.irs_990.irs_990_2015 +WHERE elf != 'E' +SELECT + ein + , COUNTIF(elf = 'E') AS num_elf /* COUNTIF takes a single BOOL expression */ + , COUNTIF(elf != 'E') AS num_not_elf /* rows with NULL elf fall in neither count */ +FROM `bigquery-public-data`.irs_990.irs_990_2015 +GROUP BY ein +HAVING num_elf > 0 AND num_not_elf > 0 +ORDER BY num_elf DESC +LIMIT 3 +SELECT + ein +FROM + [TABLENAME] +WHERE + 'E' NOT IN (SELECT elf FROM UNNEST(filing)) +LIMIT 5UNNESTFROMelffiling'E' SELECT + ein +FROM + [TABLENAME] +WHERE + EXISTS (SELECT elf FROM UNNEST(filing) WHERE elf != 'E') +LIMIT 5 +SELECT + GENERATE_DATE_ARRAY('2019-06-23', '2019-08-22', INTERVAL 10 DAY) AS summer +UNNESTdayssummerFROMUNNESTFROMWITH days AS ( + SELECT + GENERATE_DATE_ARRAY('2019-06-23', '2019-08-22', INTERVAL 10 DAY) AS summer +) +SELECT summer_day +FROM days, UNNEST(summer) AS summer_day +CROSS JOINNULLLEFT JOINFROM days LEFT JOIN UNNEST(summer) AS summer_day +SELECT ['Lak', 'Jordan', 'Graham'] AS minions +daynoWITH days AS ( +
SELECT + GENERATE_DATE_ARRAY('2019-06-23', '2019-08-22', + INTERVAL 10 DAY) AS summer, + ['Lak', 'Jordan', 'Graham'] AS minions +) + +SELECT + summer[ORDINAL(dayno)] AS summer_day + , minions[OFFSET(MOD(dayno, + ARRAY_LENGTH(minions)))] + AS minion +FROM + days, UNNEST(GENERATE_ARRAY(1,ARRAY_LENGTH(summer),1)) dayno +ORDER BY summer_day ASC +ORDINALOFFSETARRAY_LENGTHsummerARRAY_CONCATSELECT + ARRAY_CONCAT( + GENERATE_DATE_ARRAY('2019-03-23', '2019-06-22', INTERVAL 20 DAY) + , GENERATE_DATE_ARRAY('2019-08-23', '2019-11-22', INTERVAL 20 DAY) + ) AS shoulder_season +ARRAY_TO_STRINGSELECT +ARRAY_TO_STRING(['A', 'B', NULL, 'D'], '*', 'na') AS arr +ARRAY_TO_STRINGNULLSELECT +TO_JSON_STRING( + GENERATE_DATE_ARRAY('2019-06-23', '2019-08-22', + INTERVAL 10 DAY)) AS json +SELECT +TO_JSON_STRING([ + STRUCT(1 AS a, 'bbb' AS b), + STRUCT(2 AS a, 'ccc' AS b) +]) AS json +minionsGENERATE_ARRAY +GENERATE_DATE_ARRAY +SELECT +GENERATE_ARRAY(10, 20, 3) +OFFSET +ORDINAL +SELECT minions[OFFSET(0)] FROM … +SELECT minions[ORDINAL(1)] FROM ... 
+ARRAY_LENGTH +SELECT ARRAY_LENGTH(minions) +UNNEST +WITH workers AS ( + SELECT ['Lak', 'Jordan', 'Graham'] + AS minions +) +SELECT m +FROM workers, UNNEST(minions) AS m +IN + +WITH workers AS ( + SELECT ['Lak', 'Jordan', 'Graham'] + AS minions +) +SELECT 'Lak' IN UNNEST(minions) +FROM workers +EXISTS +WITH workers AS ( + SELECT ['Lak', 'Jordan', 'Graham'] AS minions + UNION ALL SELECT [] AS minions +) + +SELECT + EXISTS (SELECT * FROM + UNNEST(minions)) +FROM workers +ARRAY_AGG +SELECT + ein + , ARRAY_AGG(elf) AS elf +FROM `bigquery-public-data`.irs_990.irs_990_2015 +GROUP BY ein +LIMIT 3 +ARRAY_CONCAT +SELECT + ARRAY_CONCAT( ['A', 'B', 'C'], + ['D', 'E', 'F']) +ARRAY_TO_STRING +TO_JSON_STRING +SELECT TO_JSON_STRING([ + STRUCT(1 AS a, 'bbb' AS b), + STRUCT(2 AS a, 'ccc' AS b) +]) +SELECT + MAX(duration) AS longest_duration + , COUNT(*) AS num_trips + , AVG(duration) AS average_duration +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +SELECT + AVG(duration) + OVER(ORDER BY start_date ASC + ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) + AS average_duration +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +LIMIT 5 +AVG(duration)OVERstart_dateaverage_durationnullROWS BETWEEN 50 PRECEDING AND 50 FOLLOWINGROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROWROWS 50 PRECEDING +ROWS BETWEEN 50 PRECEDING AND CURRENT ROW +PARTITION BYOVER()SELECT + AVG(duration) + OVER(PARTITION BY start_station_id + ORDER BY start_date ASC + ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) + AS average_duration +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +LIMIT 5 +RANGESELECT + AVG(duration) + OVER(PARTITION BY start_station_id + ORDER BY UNIX_SECONDS(start_date) ASC + RANGE BETWEEN 3600 PRECEDING AND CURRENT ROW) + AS average_duration +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +LIMIT 5 +OVER()bike_idstart_dateLAST_VALUESELECT + start_date + , end_date + , LAST_VALUE(start_date) + OVER(PARTITION BY bike_id + ORDER BY start_date ASC + ROWS BETWEEN 
CURRENT ROW AND 1 FOLLOWING) + AS next_rental_start +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +LIMIT 5 +SELECT + start_date + , end_date + , LEAD(start_date, 1) + OVER(PARTITION BY bike_id + ORDER BY start_date ASC ) + AS next_rental_start +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +LIMIT 5 +LEADLAST_VALUELEADFIRST_VALUELAGNTH_VALUESELECT + start_station_id + , duration + , RANK() + OVER(PARTITION BY start_station_id ORDER BY duration DESC) + AS nth_longest +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +LIMIT 5 +RANKSELECTWITHstart_station_idARRAY_AGGWITH longest_trips AS ( + SELECT + start_station_id + , duration + , RANK() + OVER(PARTITION BY start_station_id ORDER BY duration DESC) + AS nth_longest + FROM + `bigquery-public-data`.london_bicycles.cycle_hire +) + +SELECT + start_station_id + , ARRAY_AGG(duration ORDER BY nth_longest LIMIT 3) AS durations +FROM + longest_trips +GROUP BY start_station_id +LIMIT 5 +RANK(),DENSE_RANK()ROW_NUMBER()WITH example AS ( + SELECT 'A' AS name, 32 AS age + UNION ALL SELECT 'B', 32 + UNION ALL SELECT 'C', 33 + UNION ALL SELECT 'D', 33 + UNION ALL SELECT 'E', 34 +) + +SELECT + name + , age + , RANK() OVER(ORDER BY age) AS rank + , DENSE_RANK() OVER(ORDER BY age) AS dense_rank + , ROW_NUMBER() OVER(ORDER BY age) AS row_number +FROM example +RANK()DENSE_RANK()ROW_NUMBER()SELECT + ein + , ARRAY_AGG(STRUCT(elf, tax_pd, subseccd)) AS filing +FROM `bigquery-public-data`.irs_990.irs_990_2015 +GROUP BY ein +elftax_pdsubseccdirs_990_2015irs_990 SELECT column_name +FROM `bigquery-public-data`.irs_990.INFORMATION_SCHEMA.COLUMNS +WHERE table_name = 'irs_990_2015' +WITH columns AS ( + SELECT column_name + FROM `bigquery-public-data`.irs_990.INFORMATION_SCHEMA.COLUMNS + WHERE table_name = 'irs_990_2015' AND column_name != 'ein' +) + +SELECT CONCAT( + 'SELECT ein, ARRAY_AGG(STRUCT(', + ARRAY_TO_STRING(ARRAY(SELECT column_name FROM columns), ',\n '), + '\n)) FROM 
`bigquery-public-data`.irs_990.irs_990_2015\n', + 'GROUP BY ein') +SELECT ein, ARRAY_AGG(STRUCT(ein, + elf, + tax_pd, + subseccd, + ... + othrinc509, + totsupp509 +)) FROM `bigquery-public-data`.irs_990.irs_990_2015 +GROUP BY ein +component:salesportalteam:emeasalesenvironment:productionstate:validateddevelopmentstagingtestproductionch08euALTER TABLE SET OPTIONSbq update --set_label costcenter:abc342 ch08eu +bq update --set_label costcenter:def456 ch08eu +bq query --label costcenter:def456 --nouse_legacy_sql 'SELECT ...' +bq ls --filter 'labels.costcenter:def456' + datasetId + ----------- + ch08eu +SYSTEM_TIMESELECT + * +FROM `bigquery-public-data`.london_bicycles.cycle_stations +FOR SYSTEM_TIME AS OF + TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 6 HOUR) +CREATE OR REPLACE TABLE ch08eu.hydepark_stations AS +SELECT + * EXCEPT(longitude, latitude) + , ST_GeogPoint(longitude, latitude) AS location +FROM `bigquery-public-data`.london_bicycles.cycle_stations +WHERE name LIKE '%Hyde%' +CREATE OR REPLACE TABLE ch08eu.hydepark_stations +OPTIONS( + expiration_timestamp=TIMESTAMP "2020-01-01 00:00:00 UTC", + description="Stations with Hyde Park in the name", + labels=[("cost_center", "abc123")] +) AS +SELECT + * EXCEPT(longitude, latitude) + , ST_GeogPoint(longitude, latitude) AS location +FROM `bigquery-public-data.london_bicycles.cycle_stations` +WHERE name LIKE '%Hyde%'CREATE OR REPLACE TABLE ch08eu.hydepark_rides +( + start_time TIMESTAMP, + duration INT64, + start_station_id INT64, + start_station_name STRING, + end_station_id INT64, + end_station_name STRING +) +PARTITION BY DATE(start_time) +CLUSTER BY start_station_id +ALTER TABLE ch08eu.hydepark_rides +SET OPTIONS( + expiration_timestamp=TIMESTAMP "2021-01-01 00:00:00 UTC", + require_partition_filter=True, + labels=[("cost_center", "def456")] +) +UPDATEMERGEcycle_hireshydepark_ridesINSERT ch08eu.hydepark_rides +SELECT + start_date AS start_time + , duration + , start_station_id + , start_station_name + , 
end_station_id + , end_station_name +FROM + `bigquery-public-data`.london_bicycles.cycle_hire +WHERE + start_station_name LIKE '%Hyde%' +WITH rides_in_year AS ( +SELECT + EXTRACT(MONTH from start_time) AS month + , duration +FROM ch08eu.hydepark_rides +WHERE + DATE(start_time) BETWEEN '2016-01-01' AND '2016-12-31' + AND start_station_id = 300 + AND end_station_id = 303 +) + +SELECT + month + , AVG(duration)/60 as avg_duration_minutes +FROM rides_in_year +GROUP BY month +ORDER BY avg_duration_minutes DESC +LIMIT 5 +require_partition_filter=True CLUSTER BY start_station_id.INSERTINSERT ch08eu.hydepark_rides ( + start_time + , duration + , start_station_id + , start_station_name + , end_station_id + , end_station_name +) +VALUES +('2016-02-18 17:21:00 UTC', 720, 300, +'Serpentine Car Park, Hyde Park', 303, 'Albert Gate, Hyde Park'), +('2016-02-18 16:30:00 UTC', 1320, 300, +'Serpentine Car Park, Hyde Park', 303, 'Albert Gate, Hyde Park') +'Albert Gate, Hyde Park' 'Hyde Park: Albert Gate'... +VALUES +('2016-02-18 17:21:00 UTC', 720, + 300, (SELECT name FROM `bigquery-public-data`.london_bicycles.cycle_stations WHERE id = 300), + 303, (SELECT name FROM `bigquery-public-data`.london_bicycles.cycle_stations WHERE id = 303)), +... +CREATE TEMPORARY FUNCTION stationName(stationId INT64) AS( + (SELECT name FROM + `bigquery-public-data`.london_bicycles.cycle_stations + WHERE id = stationId) +); +DELETE ch08eu.hydepark_rides +WHERE + start_time > '2016-12-01' AND + (duration IS NULL OR duration = 0)DELETE ch08eu.hydepark_rides +WHERE + userId = 3452123 +MERGEUPDATE ch08eu.hydepark_rides +SET duration = duration * 60 +WHERE + start_time > '2016-12-01' AND + start_station_id = 303maintenanceUPDATE ch08eu.stations_table +SET maintenance = ARRAY_CONCAT(maintenance, + ARRAY_STRUCT