Revision control

Copy as Markdown

# Dummy URLs
The file `/components/places/fixtures/dummy_urls.json` contains about 2000 URLs that might appear in someone's history. It's for use with benchmarks.
There are a small number of duplicates, but the observation API makes this not a problem.
It was generated by scraping all entries on the first 15 pages of https://news.ycombinator.com on 1/31/2019, and the top entries from both https://old.reddit.com/r/firefox and https://old.reddit.com/r/firefox as of the same date.
The following script was used, sort of:
```js
// Shared state for the scraper functions below.

// Scraped entries keyed by URL, so a link seen more than once is stored once.
// Each value has the shape { url, title }. The binding is never reassigned,
// only mutated through `set`, hence `const`.
const results = new Map();
// The scrapers stop requesting further pages once `results` holds this many entries.
const LIMIT = 5000;
// Pause, in milliseconds (it is passed to setTimeout), between page fetches.
const PAGE_FETCH_DELAY = 1000;
/**
 * Scrape Hacker News story links into the shared `results` map.
 *
 * Fetches one listing page, stores every `a.storylink` anchor as
 * `{ url, title }` keyed by its href, then waits `PAGE_FETCH_DELAY` ms and
 * recurses onto the next page. Stops when `results` has reached `LIMIT`
 * entries, when a page yields no story links, or when a request fails.
 *
 * Relies on the browser globals `fetch` and `DOMParser`.
 *
 * @param {number} [page=0] Zero-based index of the listing page to fetch.
 * @returns {Promise<void>} Resolves once scraping has stopped.
 */
async function getYC(page = 0) {
  console.log(`YC Page ${page}`);
  // NOTE(review): the original URL expression was truncated in this document
  // (only `page == 0 ?` survived). Reconstructed on the assumption that Hacker
  // News listing pages are addressed as `news?p=N` with N starting at 1 —
  // confirm against the site before re-running.
  const url = page === 0
    ? "https://news.ycombinator.com/news"
    : `https://news.ycombinator.com/news?p=${page + 1}`;
  const resp = await fetch(url);
  if (!resp.ok) {
    console.error(`YC: error ${resp.status} (${resp.statusText}) on page ${page}.`, resp);
    console.error(await resp.text());
    return;
  }
  const text = await resp.text();
  const dom = new DOMParser().parseFromString(text, "text/html");
  const links = dom.querySelectorAll("a.storylink");
  if (links.length === 0) {
    console.log("YC: No links found. Guess we're done");
    return;
  }
  for (const link of links) {
    results.set(link.href, { url: link.href, title: link.textContent });
  }
  if (results.size < LIMIT) {
    // Be polite to the server: pause before requesting the next page.
    await new Promise(resolve => setTimeout(resolve, PAGE_FETCH_DELAY));
    return getYC(page + 1);
  }
}
/**
 * Scrape link posts from an old.reddit.com listing into the shared `results` map.
 *
 * Fetches `url`, stores every matching post title anchor as `{ url, title }`
 * keyed by its href, then follows the page's "next" button after waiting
 * `PAGE_FETCH_DELAY` ms. Stops when `results` has reached `LIMIT` entries,
 * when a page yields no links, when there is no next button, or when a
 * request fails.
 *
 * Relies on the browser globals `fetch` and `DOMParser`.
 *
 * @param {string} url Listing page to fetch.
 * @returns {Promise<void>} Resolves once scraping has stopped.
 */
async function getReddit(url) {
  console.log(`Reddit from ${url} (have ${results.size} entries)`);
  const resp = await fetch(url);
  if (!resp.ok) {
    // Report the URL: this function has no `page` variable, so interpolating
    // one here would throw a ReferenceError instead of logging the failure.
    console.error(`Reddit: error ${resp.status} (${resp.statusText}) on ${url}.`, resp);
    console.error(await resp.text());
    return;
  }
  const text = await resp.text();
  const dom = new DOMParser().parseFromString(text, "text/html");
  // Self and promoted links have weird urls.
  const links = dom.querySelectorAll(".thing.link:not(.self):not(.promoted) a.title");
  if (links.length === 0) {
    console.warn("reddit: No links found. Guess we're done");
    return;
  }
  for (const link of links) {
    results.set(link.href, { url: link.href, title: link.textContent });
  }
  if (results.size < LIMIT) {
    // The next url is weird, find it from the button.
    const next = dom.querySelector(".next-button a");
    if (next == null) {
      console.log("Reddit: Failed to find next button, guess we're done");
      return;
    }
    // Be polite to the server: pause before requesting the next page.
    await new Promise(resolve => setTimeout(resolve, PAGE_FETCH_DELAY));
    return getReddit(next.href);
  }
}
```
This script doesn't include code to actually execute anything, because you can't call `getReddit` and `getYC` from the same page (due to CSP directives).
I then manually concatenated the output of `temp0.map(v => JSON.stringify(v)).join(',\n')` and wrapped it in square brackets.
This is kludgy, and it would have been better to scrape it from Node or something, but it only needed to be run once, so whatever.