#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
import csv
import datetime
import os
import sys
import xml.etree.ElementTree as ET
from matplotlib import pyplot as plt
from config import *
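# `config` is expected to provide at least FIRST_DATE (a datetime cutoff used
# when filtering pings) and FIXES (a list of (description, version-date) pairs,
# where the version date matches what parse_version() returns). These names are
# inferred from their use below.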
"""
Get the data from this query:
"""
ROW_HEIGHT = 10
ZERO_LENGTH = 1
DUPLICATE_TIME = 2
NO_BASELINE = 3
MISSING_SEQ = 4
DUPLICATE_SEQ = 5
TOO_LATE = 6
FAR_FROM_4AM = 7
MAX_NOTES = 8
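# MAX_NOTES is one past the largest note id; it is the exclusive upper bound
# when iterating over note ids (see plot_summary).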
NOTE_SUMMARIES = {
ZERO_LENGTH: "metrics ping had start/end time of < 1 minute",
DUPLICATE_TIME: "2 or more metrics pings were collected within the same minute",
NO_BASELINE: "a metrics ping was collected with no baseline ping since the last metrics ping",
MISSING_SEQ: "the seq number is not contiguous with the previous ping",
DUPLICATE_SEQ: "the same seq number was used more than once",
TOO_LATE: "the metrics ping was collected more than 24 hours after the last baseline ping",
FAR_FROM_4AM: "the metrics ping was sent more than an hour from 4am local time",
}
def load_data(filename):
"""
Load the csv file and convert it to a list of dicts.
"""
print("Loading CSV")
data = []
with open(filename) as fd:
reader = csv.reader(fd)
column_names = next(reader)
for row in reader:
            data.append(dict(zip(column_names, row)))
return data
def parse_version(build_id):
"""
Parse the "date-like" string out of the Fenix Nightly version convention.
Returns `None` if no version found.
"""
if build_id.startswith('"'):
build_id = build_id[1:-1]
if build_id.startswith("Nightly"):
parts = build_id.split()
date = int(parts[1])
return date
return None
def filter_data(data):
"""
Remove pings that are too old.
"""
now = datetime.datetime.now() + datetime.timedelta(days=1)
result = [
x
for x in data
if (
get_local_time(x["start_time"]) >= FIRST_DATE
and get_local_time(x["end_time"]) >= FIRST_DATE
and get_local_time(x["start_time"]) < now
and get_local_time(x["end_time"]) < now
)
]
print(f"Removed {len(data)-len(result)}/{len(data)} pings with out-of-range dates")
return result
def annotate_data(data):
"""
Add some derived values to the data set.
"""
print("Annotating CSV")
for ping in data:
ping["start_time_tz"] = get_timezone(ping["start_time"])
ping["end_time_tz"] = get_timezone(ping["end_time"])
ping["start_time_local"] = get_local_time(ping["start_time"])
ping["end_time_local"] = get_local_time(ping["end_time"])
ping["start_time_hour"] = get_fractional_hour(ping["start_time_local"])
ping["end_time_hour"] = get_fractional_hour(ping["end_time_local"])
ping["version_date"] = parse_version(ping["app_version"])
ping["notes"] = set()
def sort_data_by_client_id(data):
"""
Reorganize the data so it is grouped by client id.
"""
data_by_client_id = {}
for row in data:
client_id = row.get("client_id")
data_by_client_id.setdefault(client_id, [])
data_by_client_id[client_id].append(row)
return data_by_client_id
def get_timezone(date_string):
"""
Get the timezone offset from a Glean timestamp.
"""
return date_string[-6:]
def get_local_time(date_string):
"""
Get just the local time from the Glean timestamp.
"""
return datetime.datetime.fromisoformat(date_string[:-6])
def get_fractional_hour(dt):
"""
Convert the timestamp to a "fractional hour" (hours since the UNIX epoch)
which is useful for plotting.
"""
return dt.timestamp() / 360.0
def has_timezone_change(client_data):
"""
Determine if the client had a timezone change in their history. These are
excluded from the analysis for now because it's a complicated corner case.
"""
timezones = set()
for entry in client_data:
timezones.add(entry["start_time_tz"])
timezones.add(entry["end_time_tz"])
return len(timezones) > 1
def organize_plot(data):
"""
Organize the data into rows so no two timespans overlap.
"""
rows = []
for entry in data:
        # Find the first row where the entry will fit; otherwise, create a
        # new row
for row in rows:
if entry["start_time_local"] > row[-1]["end_time_local"]:
row.append(entry)
break
else:
rows.append([entry])
return rows
def draw_line(parent, x1, x2, y1, y2, **kwargs):
"""
    Draw an SVG line. It is adjusted so its length is at least 0.5 pixels,
otherwise it will disappear during rendering.
"""
diff = abs(x2 - x1) - 0.5
if diff < 0:
        # diff is negative here; nudge the endpoints apart so the line is at
        # least 0.5 pixels long (all callers pass x1 <= x2)
        x1 += diff / 2.0
        x2 -= diff / 2.0
attrs = {"x1": str(x1), "x2": str(x2), "y1": str(y1), "y2": str(y2)}
kwargs = dict((k.replace("_", "-"), v) for (k, v) in kwargs.items())
attrs.update(kwargs)
return ET.SubElement(parent, "line", attrs)
def draw_text(parent, x, y, text, **kwargs):
"""
Draw SVG text.
"""
title = kwargs.pop("title", None)
attrs = {"x": str(x), "y": str(y), "font-family": "sans-serif", "font-size": "10px"}
kwargs = dict((k.replace("_", "-"), v) for (k, v) in kwargs.items())
attrs.update(kwargs)
el = ET.SubElement(parent, "text", attrs)
el.text = text
if title is not None:
title_el = ET.SubElement(el, "title")
title_el.text = title
return el
def plot_timeline(client_id, data, metrics_rows, baseline_rows):
"""
Make the SVG timeline.
"""
data = sorted(data, key=lambda x: x["start_time_hour"])
# Find the date range to determine the size of the plot
min_time = data[0]["start_time_hour"]
max_time = max(ping["end_time_hour"] for ping in data)
width = max_time - min_time
height = (len(metrics_rows) + len(baseline_rows) + 2) * ROW_HEIGHT
svg = ET.Element(
"svg",
{
"version": "1.1",
"width": str(width),
"height": str(height),
},
)
ET.SubElement(
svg,
"rect",
{
"x": "0",
"y": "0",
"width": str(width),
"height": str(height),
"fill": "white",
},
)
# Draw vertical lines at midnight and 4am, with the date indicated
dt = data[0]["start_time_local"].replace(hour=0, minute=0, second=0)
while get_fractional_hour(dt) < max_time:
x = get_fractional_hour(dt) - min_time
draw_line(svg, x, x, 0, height, stroke="#cccccc")
draw_text(svg, x + 2, height - 2, dt.strftime("%m-%d"))
four = dt.replace(hour=4)
x = get_fractional_hour(four) - min_time
draw_line(svg, x, x, 0, height, stroke="#cccccc", stroke_dasharray="2,1")
dt += datetime.timedelta(days=1)
    # Draw markers at the first time each key "FIX" version was seen in the ping metadata
fixes = list(enumerate(FIXES))
for ping in sorted(data, key=lambda x: x["end_time_local"]):
if ping["version_date"] is not None and ping["version_date"] >= fixes[0][1][1]:
x = ping["end_time_hour"] - min_time
draw_line(svg, x, x, 0, height, stroke="#33aa33")
draw_text(svg, x + 2, 12, str(fixes[0][0] + 1), title=fixes[0][1][0])
fixes.pop(0)
if len(fixes) == 0:
break
# Draw the actual pings in the timeline
y = ROW_HEIGHT
for (rows, color) in ((baseline_rows, "#000088"), (metrics_rows, "#880000")):
for row in rows[::-1]:
for ping in row:
draw_line(
svg,
ping["start_time_hour"] - min_time,
ping["end_time_hour"] - min_time,
y,
y,
stroke=color,
stroke_width="0.5",
)
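                # For baseline pings, also draw a thicker line spanning the
                # session duration, ending at the ping's end time.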
if ping["ping_type"] == "baseline" and ping["duration"]:
session_start = (
get_fractional_hour(
ping["end_time_local"]
- datetime.timedelta(seconds=int(ping["duration"]))
)
- min_time
)
draw_line(
svg,
session_start,
ping["end_time_hour"] - min_time,
y,
y,
stroke=color,
stroke_width="3",
)
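                # Label the ping with its note ids; the SVG <title> children
                # provide the full summaries as hover text.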
if ping["notes"]:
x = 0
for note in sorted(list(ping["notes"])):
draw_text(
svg,
ping["end_time_hour"] - min_time + 2 + x,
y + 3,
str(note),
font_size="6px",
title=NOTE_SUMMARIES[note],
)
x += 8
y += ROW_HEIGHT
draw_text(svg, 2, 12, f"Android SDK: {data[0]['sdk']}")
tree = ET.ElementTree(svg)
with open(f"{client_id}.svg", "wb") as fd:
tree.write(fd)
def find_issues(client_data, stats):
"""
Find and notate issues for a client's data.
"""
client_data = sorted(client_data, key=lambda x: (x["end_time_local"], x["seq"]))
last_ping = None
last_by_type = {}
client_stats = {}
for ping in client_data:
# Find zero-length pings
if (
ping["ping_type"] == "metrics"
and ping["start_time_local"] == ping["end_time_local"]
):
ping["notes"].add(ZERO_LENGTH)
# Find multiple pings with the same end_time
if last_ping is not None:
if ping["ping_type"] == "metrics" and last_ping["ping_type"] == "metrics":
if ping["end_time_local"] == last_ping["end_time_local"]:
ping["notes"].add(DUPLICATE_TIME)
last_ping["notes"].add(DUPLICATE_TIME)
else:
ping["notes"].add(NO_BASELINE)
# Find missing or duplicate seq numbers
last_of_same_type = last_by_type.get(ping["ping_type"])
if last_of_same_type is not None:
            # Check for duplicates first: a repeated seq number would otherwise
            # also fail the contiguity check and be reported as missing instead
            if int(last_of_same_type["seq"]) == int(ping["seq"]):
                ping["notes"].add(DUPLICATE_SEQ)
                last_of_same_type["notes"].add(DUPLICATE_SEQ)
            elif int(last_of_same_type["seq"]) + 1 != int(ping["seq"]):
                ping["notes"].add(MISSING_SEQ)
if ping["ping_type"] == "metrics":
# Find metrics pings that are more than 24 hours after the last baseline ping
last_baseline = last_by_type.get("baseline")
if last_baseline is not None and ping["end_time_local"] > last_baseline[
"end_time_local"
] + datetime.timedelta(days=1):
ping["notes"].add(TOO_LATE)
# Find metrics pings that are more than +/-1 hour from 4am
if abs(
ping["end_time_local"]
- ping["end_time_local"].replace(hour=4, minute=0, second=0)
) > datetime.timedelta(hours=1):
ping["notes"].add(FAR_FROM_4AM)
# Add notes to the overall client stats
for note in ping["notes"]:
client_stats.setdefault(note, 0)
client_stats[note] += 1
last_ping = ping
last_by_type[ping["ping_type"]] = ping
# Add client stats to the overall stats
for note in client_stats.keys():
stats.setdefault(note, 0)
stats[note] += 1
return client_stats
def process_single_client(client_id, client_data, stats):
"""
Process a single client, performing the analysis and writing out a plot.
"""
if has_timezone_change(client_data):
stats["changed_timezones"] += 1
return {"changed_timezones": True}
client_stats = find_issues(client_data, stats)
client_data.sort(key=lambda x: x["start_time_local"])
metrics_rows = organize_plot(x for x in client_data if x["ping_type"] == "metrics")
baseline_rows = organize_plot(
x for x in client_data if x["ping_type"] == "baseline"
)
plot_timeline(client_id, client_data, metrics_rows, baseline_rows)
return client_stats
def analyse_by_day(data):
"""
Find the "issues" notated in the `notes` field on each ping and generate
a graph of their frequencies over time.
"""
data_by_day = {}
for ping in data:
if ping["ping_type"] == "metrics":
day = ping["end_time_local"].replace(hour=0, minute=0, second=0)
data_by_day.setdefault(day, {})
day_data = data_by_day[day]
day_data.setdefault("total", 0)
day_data["total"] += 1
for note in ping["notes"]:
day_data.setdefault(note, 0)
day_data[note] += 1
for i, fix in enumerate(FIXES):
if ping["version_date"] is not None and ping["version_date"] >= fix[1]:
fix_id = f"fix{i}"
day_data.setdefault(fix_id, 0)
day_data[fix_id] += 1
# Trim the first and last couple of days, since they aren't meaningful
data_by_day = sorted(list(data_by_day.items()))[2:-2]
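    # Result: a sorted list of (day, counts) pairs, where counts maps a note
    # id or "fix{i}" key to the number of metrics pings affected that day,
    # plus a "total" count of all metrics pings that day.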
return data_by_day
def plot_summary(data_by_day, output_filename="summary.svg"):
"""
Plot the summary of issues by day.
"""
dates = [x[0] for x in data_by_day]
plt.figure(figsize=(20, 20))
plt.subplot(211)
plt.title("Frequency of notes by day")
for note in range(1, MAX_NOTES):
note_values = [x[1].get(note, 0) / float(x[1]["total"]) for x in data_by_day]
plt.plot(dates, note_values, label=NOTE_SUMMARIES[note])
plt.legend()
plt.grid()
plt.subplot(212)
plt.title("Uptake of fixes by day")
for i, fix in enumerate(FIXES):
fix_values = [
x[1].get(f"fix{i}", 0) / float(x[1]["total"]) for x in data_by_day
]
plt.plot(dates, fix_values, label=fix[0])
plt.legend()
plt.grid()
plt.savefig(output_filename)
def main(input, output):
data = load_data(input)
data = filter_data(data)
annotate_data(data)
data_by_client_id = sort_data_by_client_id(data)
if not os.path.isdir(output):
os.makedirs(output)
os.chdir(output)
stats = {
"total_clients": len(data_by_client_id),
"changed_timezones": 0,
}
client_stats = {}
for i, (client_id, client_data) in enumerate(data_by_client_id.items()):
print(f"Analysing client: {i}/{len(data_by_client_id)}", end="\r")
client_stats[client_id] = process_single_client(client_id, client_data, stats)
plot_summary(analyse_by_day(data))
print(stats)
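# Typical invocation (file names are illustrative):
#   python3 <this script> pings.csv output/
# This writes one <client_id>.svg timeline per client plus summary.svg into
# the output directory and prints the overall stats.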
if __name__ == "__main__":
# Parse commandline arguments
    parser = argparse.ArgumentParser(
        description="Analyse patterns in baseline and metrics pings"
    )
parser.add_argument("input", nargs=1, help="The input dataset (in csv)")
parser.add_argument("output", nargs=1, help="The output directory")
args = parser.parse_args()
input = args.input[0]
output = args.output[0]
main(input, output)