Investigating Internet Shutdowns with Google Traffic Stats

Requirements:

  • numpy
  • pandas
  • requests
In [ ]:
import os
import json

import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
import plotly.plotly as py
import matplotlib.pyplot as plt
import cufflinks as cf
import statsmodels.api as sm
cf.go_offline()
%matplotlib inline

The code below is needed to retrieve Google Transparency Report data.

You can skip reading this section, just be sure to run it.

In [ ]:
# The functions below are a python port of the official gwt library.
# see: https://github.com/gwtproject/gwt/blob/master/user/src/com/google/gwt/user/client/rpc/impl/AbstractSerializationStream.java

def long_from_base64(value):
    pos = 0
    long_val = base64_value(value[pos])
    pos += 1
    while pos < len(value):
        long_val <<= 6
        long_val |= base64_value(value[pos])
        pos += 1
    return long_val

def long_to_base64(value):
    # Convert to ints early to avoid need for long ops
    low = value & 0xffffffff
    high = value >> 32
    sb = ''
    hnz = False # Have Non zero
    hnz, s = base64_digit((high >> 28) & 0xf, hnz)
    sb += s
    hnz, s = base64_digit((high >> 22) & 0x3f, hnz)
    sb += s
    hnz, s = base64_digit((high >> 16) & 0x3f, hnz)
    sb += s
    hnz, s = base64_digit((high >> 10) & 0x3f, hnz)
    sb += s
    hnz, s = base64_digit((high >> 4) & 0x3f, hnz)
    sb += s
    v = ((high & 0xf) << 2) | ((low >> 30) & 0x3)
    hnz, s = base64_digit(v, hnz)
    sb += s
    hnz, s = base64_digit((low >> 24) & 0x3f, hnz)
    sb += s
    hnz, s = base64_digit((low >> 18) & 0x3f, hnz)
    sb += s
    hnz, s = base64_digit((low >> 12) & 0x3f, hnz)
    sb += s
    hnz, s = base64_digit((low >> 6) & 0x3f, hnz)
    sb += s
    hnz, s = base64_digit(low & 0x3f, hnz)
    sb += s
    return sb
    
def base64_digit(digit, have_non_zero):
    if digit > 0:
        have_non_zero = True
    if have_non_zero:
        if (digit < 26):
            return (have_non_zero, chr(ord('A') + digit))
        elif (digit < 52):
            return (have_non_zero, chr(ord('a') + digit - 26))
        elif (digit < 62):
            return (have_non_zero, chr(ord('0') + digit - 52))
        elif (digit == 62):
            return (have_non_zero, '$')
        else:
            return (have_non_zero, '_')
    return (have_non_zero, '')

def base64_value(char):
    # Assume digit is one of [A-Za-z0-9$_]
    if (char >= 'A' and char <= 'Z'):
        return ord(char) - ord('A')
    # No need to check digit <= 'z'
    if (char >= 'a'):
        return ord(char) - ord('a') + 26
    if (char >= '0' and char <= '9'):
        return ord(char) - ord('0') + 52
    if (char == '$'):
        return 62
    # digit == '_'
    return 63
assert long_to_base64(long_from_base64('U3Ay4uu')) == 'U3Ay4uu'

def dt_to_gwt_b64(dt):
    """
    Convert a date to gwt base64 format (string)
    """
    epoch = datetime.utcfromtimestamp(0)
    ms_since_epoch = int((dt - epoch).total_seconds()*1000)
    return long_to_base64(ms_since_epoch)
def gwt_b64_to_dt(s):
    """
    Takes a gwt base64 format and makes it into a date
    """
    ms_since_epoch = long_from_base64(s)
    return datetime.utcfromtimestamp(ms_since_epoch/1000)
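
# Sanity check: the date <-> GWT base64 conversion should round-trip exactly
# (the example date here is arbitrary).
assert gwt_b64_to_dt(dt_to_gwt_b64(datetime(2016, 6, 1))) == datetime(2016, 6, 1)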

def parse_time_series(response):
    """
    This is an attempt at reverse engineering their format.
    Not all of it works and it sometimes breaks unexpectedly.
    """
    series = {
        'start_time': gwt_b64_to_dt(response[2]),
        'end_time': gwt_b64_to_dt(response[0])
    }
    timestamps = []
    values = []
    count = None
    idx = 5
    while True:
        value = response[idx]
        dtype = response[idx+1]
        if dtype == 13:
            values.append(value)
        elif dtype == 8:
            count = value
            idx += 2
            break
        idx += 2
    offset = 0
    while True:
        #print response[offset+idx:offset+idx+10]
        value = response[offset+idx]
        dtype = response[offset+idx+1]
        if value == -3:
            # XXX this appears to happen when a value is missing
            if len(timestamps) > 2:
                timestamps.append(timestamps[-1] + (timestamps[-2] - timestamps[-1]))
            offset += 1
            continue
        if dtype != 3:
            break
        #assert dtype == 3, "dtype %s != 3" % dtype
        dt = gwt_b64_to_dt(value)
        if dt < series['start_time']:
            break
        #print(dt)
        timestamps.append(dt)
        offset += 2

    series['count'] = count
    series['timestamps'] = timestamps
    series['values'] = values[:len(timestamps)]
    return series
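
# The returned dict has the shape:
# {'start_time': datetime, 'end_time': datetime, 'count': <int>,
#  'timestamps': [datetime, ...], 'values': [<number>, ...]}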

def get_time_ranges(start_time, end_time, days=7):
    if end_time - start_time <= timedelta(days=days):
        return [
            (start_time, end_time)
        ]
    time_ranges = []
    range_start = start_time
    range_end = range_start + timedelta(days=days)
    while range_end <= end_time:
        time_ranges.append((range_start, range_end))
        range_start = range_end
        range_end += timedelta(days=days)
    # Clamp the final chunk to end_time, skipping it if it would be empty
    # (which happens when the range is an exact multiple of `days`).
    if range_start < end_time:
        time_ranges.append((range_start, min(range_end, end_time)))
    return time_ranges
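
# For example, a 20 day window is split into 7 day chunks plus the remainder:
# get_time_ranges(datetime(2016, 6, 1), datetime(2016, 6, 21))
# -> [(Jun 1, Jun 8), (Jun 8, Jun 15), (Jun 15, Jun 21)]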

GOOGLE_ALPHA_2_CODES = {'BD': 20, 'BE': 21, 'BF': 22, 'BG': 23, 'BA': 18, 'BB': 19, 'BM': 28, 'BN': 29, 
                        'BO': 30, 'BH': 24, 'BI': 25, 'BJ': 26, 'BT': 34, 'JM': 123, 'BW': 37, 'WS': 262, 
                        'BR': 32, 'BS': 33, 'JE': 122, 'BY': 38, 'BZ': 39, 'RU': 205, 'RW': 206, 'RS': 204, 
                        'TL': 237, 'RE': 202, 'TM': 238, 'TJ': 235, 'RO': 203, 'GU': 102, 'GT': 101, 'GR': 99, 
                        'GQ': 98, 'GP': 97, 'JP': 125, 'GY': 104, 'GG': 91, 'GF': 90, 'GE': 89, 'GD': 88, 'GB': 87, 
                        'GA': 86, 'SV': 225, 'GN': 96, 'GM': 95, 'GL': 94, 'GI': 93, 'GH': 92, 'OM': 184, 'TN': 239, 
                        'JO': 124, 'HR': 108, 'HT': 109, 'HU': 110, 'HK': 105, 'HN': 107, 'VE': 256, 'PR': 194, 
                        'PS': 195, 'PW': 197, 'PT': 196, 'PY': 198, 'IQ': 118, 'PA': 185, 'PF': 187, 'PG': 188, 
                        'PE': 186, 'PK': 190, 'PH': 189, 'PL': 191, 'PM': 192, 'ZM': 272, 'EE': 71, 'EG': 72, 
                        'ZA': 271, 'EC': 70, 'IT': 121, 'VN': 259, 'SB': 208, 'ET': 76, 'ZW': 274, 'SA': 207, 
                        'ES': 75, 'ME': 151, 'MD': 150, 'MG': 153, 'MA': 148, 'MC': 149, 'UZ': 253, 'MM': 157, 
                        'ML': 156, 'MO': 159, 'MN': 158, 'MH': 154, 'MK': 155, 'MU': 165, 'MT': 164, 'MW': 167, 
                        'MV': 166, 'MQ': 161, 'MP': 160, 'MS': 163, 'MR': 162, 'IM': 115, 'UG': 248, 'TZ': 246, 
                        'MY': 169, 'MX': 168, 'IL': 114, 'FR': 84, 'FI': 79, 'FJ': 80, 'FO': 83, 'NI': 176, 
                        'NL': 177, 'NO': 178, 'SO': 220, 'VU': 260, 'NC': 172, 'NE': 173, 'NF': 174, 'NG': 175, 
                        'NZ': 183, 'NP': 179, 'NR': 180, 'CK': 47, 'CI': 46, 'CH': 45, 'CO': 51, 'CN': 50, 'CM': 49, 
                        'CL': 48, 'CA': 40, 'CG': 44, 'CF': 43, 'CD': 42, 'CZ': 60, 'CY': 59, 'CR': 53, 'CV': 56, 
                        'CU': 55, 'SZ': 228, 'SY': 227, 'KG': 127, 'KE': 126, 'SR': 221, 'KI': 129, 'KH': 128, 
                        'KN': 131, 'ST': 223, 'SK': 216, 'KR': 133, 'SI': 214, 'SH': 213, 'KW': 134, 'SN': 219, 
                        'SL': 217, 'SC': 209, 'KZ': 136, 'KY': 135, 'SG': 212, 'SE': 211, 'SD': 210, 'DO': 67, 
                        'DM': 66, 'DJ': 64, 'DK': 65, 'VG': 257, 'DE': 62, 'YE': 268, 'DZ': 68, 'US': 251, 'UY': 252, 
                        'YT': 269, 'LB': 138, 'LC': 139, 'LA': 137, 'TV': 244, 'TW': 245, 'TT': 243, 'TR': 242, 
                        'LK': 141, 'LI': 140, 'LV': 146, 'TO': 240, 'LT': 144, 'LU': 145, 'LR': 142, 'LS': 143, 
                        'TH': 234, 'TG': 233, 'TD': 231, 'TC': 230, 'LY': 147, 'VC': 255, 'AE': 2, 'AD': 1, 'AG': 4, 
                        'AF': 3, 'AI': 5, 'VI': 258, 'IS': 120, 'IR': 119, 'AM': 7, 'AL': 6, 'AO': 9, 'AR': 11, 
                        'AU': 14, 'AT': 13, 'AW': 15, 'IN': 116, 'AX': 16, 'AZ': 17, 'IE': 113, 'ID': 112, 'UA': 247, 
                        'QA': 199, 'MZ': 170}
def get_code_from_alpha2(alpha_2):
    try:
        return GOOGLE_ALPHA_2_CODES[alpha_2]
    except KeyError:
        raise RuntimeError("Country not supported: %s" % alpha_2)
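
# Example: get_code_from_alpha2('ET') == 76, Google's internal code for Ethiopia.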

def get_metrics_with_date(prod_code, region_code, start_time, end_time):
    print("Getting metrics for %s - %s" % (start_time, end_time))
    headers = {
        'x-client-data': 'CJG2yQEIprbJAQjBtskBCPucygEIqZ3KAQ==',
        'x-gwt-module-base': 'https://www.google.com/transparencyreport/gwt/',
        'x-gwt-permutation':'DFD0EBA544B633919D593657A1CFAC69',
        'content-type': 'text/x-gwt-rpc; charset=UTF-8',
        'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6'
    }
    data = ('7|0|11|https://www.google.com/transparencyreport/gwt/|A95F82F4A46F68F8F3518C8811783D00|'
            'com.google.analysis.gblocked.traffic.frontend.shared.TrafficService|evaluateQuery|'
            'com.google.analysis.gblocked.traffic.frontend.shared.TrafficRequest/1877719668|'
            'com.google.analysis.gblocked.traffic.common.TimeSeries/3457781141|'
            'com.google.analysis.gblocked.traffic.common.Logsource/3745169662|'
            'com.google.i18n.identifiers.RegionCode/1527795405|'
            'com.google.analysis.gblocked.traffic.frontend.shared.Zoom/2138134534|'
            'com.google.analysis.gblocked.traffic.frontend.shared.Zoom$Source/3263501257|'
            'java.util.Date/3385151746|'
     '1|2|3|4|1|5|5|598|4|6|7|{prod_code}|8|{region_code}|9|10|1|11|{end_time}|11|{start_time}|'.format(
        prod_code=prod_code,
        region_code=region_code,
        start_time=dt_to_gwt_b64(start_time),
        end_time=dt_to_gwt_b64(end_time)
    ))
    r = requests.post('https://www.google.com/transparencyreport/gwt/trafficService', headers=headers, data=data)
    return r
def parse_response(data):
    """
    Do our best to parse the response
    """
    return json.loads(data[4:].replace("'", '"').replace('\\x', '\\u00'))
def get_data_for_product(region_code, prod_code, start_time, end_time):
    timestamps = []
    values = []
    for st, et in get_time_ranges(start_time, end_time):
        r = get_metrics_with_date(prod_code, region_code, st, et)
        response = parse_response(r.text)
        ts = parse_time_series(response)
        for idx, timestamp in enumerate(ts['timestamps']):
            if timestamp < start_time or timestamp > end_time:
                continue
            timestamps.append(timestamp)
            values.append(ts['values'][idx])
    return timestamps, values

Below are the functions you should actually be calling.

In [ ]:
GOOGLE_PRODUCT_CODES = {
    'Blogger': 1,
    'Gmail': 2,
    'Google Books': 4,
    'Google Docs': 5,
    'Google Earth': 6,
    'Google Groups': 7,
    'Google Images': 8,
    'Google Maps': 9,
    'Google Search': 12,
    'Google Sites': 13,
    'Google Spreadsheets': 14,
    'Google Translate': 16,
    'Google Videos': 17,
    'Orkut': 18,
    'Picasa Web Albums': 19,
    'Traffic Graph': 0,
    'YouTube': 21
}
def get_df_traffic_for_country(alpha_2, start_time, end_time=None):
    """
    This function takes as input a country code as alpha_2 and returns a pandas
    dataframe with all the traffic data for it.
    """
    # utcnow() must be evaluated at call time; as a default argument it would
    # be evaluated only once, when the function is defined.
    if end_time is None:
        end_time = datetime.utcnow()
    result = {
        'timestamps': None
    }
    region_code = get_code_from_alpha2(alpha_2)
    for prod_name, prod_code in GOOGLE_PRODUCT_CODES.items():
        if prod_code not in [2, 21, 9, 8, 12]:
            # We only care about: Gmail, Youtube, Maps, Images, Search
            continue
        print("Getting %s - %s" % (prod_name, alpha_2))
        try:
            timestamps, values = get_data_for_product(region_code, prod_code, start_time, end_time)
            if result['timestamps']:
                assert result['timestamps'] == timestamps
            else:
                result['timestamps'] = timestamps
        except Exception as exc:
            print("MISSING DATA")
            print(exc)
            continue
        result[prod_name] = values
    df = pd.DataFrame(result)
    df.drop_duplicates(subset='timestamps', keep='last', inplace=True)
    df.set_index('timestamps', inplace=True)
    return df.sort_index()

Finding evidence of a shutdown in Ethiopia in 2016

In [ ]:
df_et = get_df_traffic_for_country('ET', datetime(2016, 6, 1, 0, 0), datetime(2016, 8, 30, 0, 0))

We use Google Search as the metric, since it seems to be the most stationary metric for every country (apart from seasonal variations).
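
As a rough sanity check on this claim we can compare the relative variability of each product's series in the df_et dataframe fetched above. A lower coefficient of variation (std/mean) loosely suggests a more stable series; this is only a sketch, not a rigorous stationarity test.

In [ ]:
# Coefficient of variation per product, lowest first: a rough proxy for how
# stable each series is.
(df_et.std() / df_et.mean()).sort_values()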

In [ ]:
target_metric = 'Google Search'
In [ ]:
df = df_et.reset_index()[['Google Search', 'timestamps']]
df.columns = ['y', 'ds']
In [ ]:
df.head()

The models require the following extra dependencies:

  • plotly
  • cufflinks

For the Facebook Prophet based model:

  • fbprophet

For the ARIMA model:

  • statsmodels
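
If any of these are missing they can be installed from within the notebook, for example:

In [ ]:
# Install the extra plotting and modelling dependencies (run only if missing)
!pip install plotly cufflinks statsmodels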

fbprophet model

This first model uses fbprophet, an "Automatic Forecasting Procedure" developed by Facebook. The paper to use as a reference is: https://facebookincubator.github.io/prophet/static/prophet_paper_20170113.pdf.

In order to support daily periods you will have to use my branch of it, available here: https://github.com/hellais/prophet/tree/feature/daily-seasonality
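
One way to install that branch is via pip's VCS support (an assumption: this mirrors the upstream Prophet repo layout, where the Python package lives in the python/ subdirectory):

In [ ]:
# Hypothetical install command for the daily-seasonality branch; adjust the
# subdirectory if the fork is laid out differently.
!pip install 'git+https://github.com/hellais/prophet.git@feature/daily-seasonality#subdirectory=python'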

In [ ]:
from fbprophet import Prophet
In [ ]:
pm = Prophet(daily_seasonality=True)
pm.fit(df)
In [ ]:
predictions = pm.predict(df[['ds', 'y']])
In [ ]:
predictions.set_index('ds')[['y', 'yhat', 'yhat_lower', 'yhat_upper']].iplot()
In [ ]:
fbp_model = predictions.set_index('ds')[['y', 'yhat', 'yhat_lower', 'yhat_upper']]
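# The transparency report traffic values appear to be normalized to a 0-100
# scale, so dividing the residual by 100 gives a fractional anomaly factor.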
fbp_model['anomaly_factor'] = ((fbp_model['yhat'] - fbp_model['y'])/100)
layout_fbp = {'shapes': []}
anomalies = []
anomaly_start = None
trigger_factor = 0.4
for dt, row in fbp_model.iterrows():
    if row.anomaly_factor and row.anomaly_factor > trigger_factor:
        if anomaly_start is None:
            anomaly_start = dt
    elif anomaly_start is not None:
        anomalies.append((anomaly_start, dt))
        anomaly_start = None
for start, end in anomalies:
    layout_fbp['shapes'].append({
            'type': 'rect',
            'xref': 'x',
            'yref': 'paper',
            'x0': start,
            'y0': 0,
            'x1': end,
            'y1': 1,
            'fillcolor': 'red',
            'opacity': 0.5,
            'line': {
                'width': 0
            }
        })
In [ ]:
fbp_model.iplot(layout=layout_fbp)

ARIMA based model

We build a "Seasonal AutoRegressive Integrated Moving Average with eXogenous regressors" (SARIMAX) model and compare the actual values to the values predicted by the model. We pass s=48 as the seasonal period in seasonal_order, since the data has 48 ticks per day (one sample every 30 minutes).
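
Before fitting, we can sanity check the assumed period on the df defined above: if the daily seasonality holds, the autocorrelation of the series should show a pronounced peak at a lag of 48.

In [ ]:
# Autocorrelation over two days of lags; expect a peak around lag 48.
acf_values = sm.tsa.acf(df.set_index('ds')['y'], nlags=96)
pd.Series(acf_values).iplot()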

In [ ]:
arimam = sm.tsa.SARIMAX(df.set_index('ds')['y'], order=(1,0,0), seasonal_order=(1,1,0,48)).fit()
In [ ]:
arima_model = df.set_index('ds')

start_time = (arima_model.index[0] + timedelta(days=1)).strftime('%Y-%m-%d')
end_time = arima_model.index[-1].strftime('%Y-%m-%d')
arima_model['ypred'] = arimam.predict(start_time, end_time, dynamic=True)

If we see an anomaly_factor greater than 0.4 (a positive value means the measured traffic dropped below the model's prediction), we color that region red, indicating a possible blackout event.

In [ ]:
arima_model['anomaly_factor'] = ((arima_model['ypred'] - arima_model['y'])/100)
layout_arima = {'shapes': []}
anomalies = []
anomaly_start = None
trigger_factor = 0.4
for dt, row in arima_model.iterrows():
    if row.anomaly_factor and row.anomaly_factor > trigger_factor:
        if anomaly_start is None:
            anomaly_start = dt
    elif anomaly_start is not None:
        anomalies.append((anomaly_start, dt))
        anomaly_start = None
for start, end in anomalies:
    layout_arima['shapes'].append({
            'type': 'rect',
            'xref': 'x',
            'yref': 'paper',
            'x0': start,
            'y0': 0,
            'x1': end,
            'y1': 1,
            'fillcolor': 'red',
            'opacity': 0.5,
            'line': {
                'width': 0
            }
        })

Plot it!

In [ ]:
arima_model.iplot(layout=layout_arima)