This repository has been archived by the owner on Sep 18, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbigquery.py
36 lines (32 loc) · 1.46 KB
/
bigquery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from google.cloud import bigquery
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2 import service_account
from os import environ
import pandas_gbq
# There are ~100,000 sessions involving search per day
# and the estimated results size is around 1GB per week of data
# TODO: measure the actual size
query = '''
SELECT
CONCAT(fullVisitorId,'|',CAST(visitId as STRING)) AS sessionId,
customDimensions.value as searchTerm,
hits.hitNumber as hitNumber, -- This is the hit number of the results page (for impressions) or the page itself (for clicks)
product.productSKU as contentIdOrPath,
product.productListPosition as linkPosition,
CASE
WHEN product.isImpression = true and product.isClick IS NULL THEN 'impression'
WHEN product.isClick = true and product.isImpression IS NULL THEN 'click'
ELSE NULL
END AS observationType
FROM `govuk-bigquery-analytics.87773428.ga_sessions_*`
CROSS JOIN UNNEST(hits) as hits
CROSS JOIN UNNEST(hits.product) as product
CROSS JOIN UNNEST(product.customDimensions) as customDimensions
WHERE product.productListName = 'Site search results'
AND _TABLE_SUFFIX BETWEEN '20180422' AND '20180425'
AND product.productListPosition <= 20
AND customDimensions.index = 71
'''
results = pandas_gbq.read_gbq(query, project_id='govuk-bigquery-analytics', private_key='govuk_bigquery.json', dialect='standard')
results.to_csv('data/bigquery_results_20180422_20180425.csv', index=False)