Source code
Revision control
Copy as Markdown
Other Tools
Test Info:
/* Any copyright is dedicated to the Public Domain.
   https://creativecommons.org/publicdomain/zero/1.0/ */
"use strict";
/**
* Tests for custom DOM extraction strategies for search engine result pages.
* These tests verify that the DOMExtractor applies site-specific extraction
* rules when a sourceUrl option is provided.
*/
/**
 * Shared fixture: abbreviated markup for a single search result entry made of
 * a linked heading, two site-name labels, a cite breadcrumb and a snippet.
 * Attribute lists that do not matter to the tests are shortened to `...`.
 *
 * The heading is wrapped in an anchor pointing at the Firefox download page;
 * the tests below rely on that anchor when they expect the heading to be
 * emitted as a markdown link.
 */
const SAMPLE_HTML = `<div class="MjjYud">
<div class="A6K0A" data-rpos="12">
<div ...>
<div class="N54PNb BToiNc" data-snc="auw0Ab">
<div class="kb0PBd A9Y9g jGGQ5e" data-snf="x5WNvb" data-snhf="0">
<div class="yuRUbf">
<div class="b8lM7">
<span class="V9tjod" jsaction="trigger.mLt3mc">
<a href="https://www.firefox.com/en-US/download/all/">
<h3>
Choose which Firefox Browser to download in your language
</h3>
<br />
<div class="notranslate ESMNde HGLrXd ojE3Fb">
<div class="q0vns">
<span class="DDKf1c"><div class="eqA2re UnOTSe Vwoesf" aria-hidden="true">
<img .../></div
></span>
<div class="CA5RN">
<div><span class="VuuXrf">Firefox</span></div>
<div class="byrV5b">
</div>
</div>
</div>
</div>
</a>
</span>
<div class="B6fmyf byrV5b Mg1HEd">
<div class="HGLrXd ojE3Fb">
<div class="q0vns">
<span class="DDKf1c"><div...></div></span>
<div class="CA5RN">
<div><span class="VuuXrf">Firefox</span></div>
<div class="byrV5b">
<cite ... role="text">https://www.firefox.com<span ... role="text">› en-US › download › all</span></cite>
</div>
</div>
</div>
</div>
<div class="csDOgf BCF2pd ezY6nb L48a4c">
<div ...>
<div ...>
<span class="D6lY4c"><span ...><svg>...</svg></span></span>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div ...>
<div ...>
<span
><em>Choose which Firefox Browser to download in your language</em
>. Everyone deserves access to the internet — your language should
never be a barrier.</span
>
</div>
</div>
</div>
</div>
</div>
</div>`;
/**
 * Baseline case: getText() is called with an empty options object, i.e. with
 * no sourceUrl, and the text extracted from SAMPLE_HTML is compared against
 * the plain (non-markdown) lines below.
 */
add_task(async function test_extraction_without_page_url() {
  const { actor, cleanup } = await html([SAMPLE_HTML]);
  const extraction = await actor.getText({});

  // NOTE(review): the failure message says cite breadcrumbs are preserved, but
  // no breadcrumb line is listed here — confirm whether an entry is missing.
  const expectedLines = [
    "Choose which Firefox Browser to download in your language",
    "Firefox",
    "Firefox",
    "Choose which Firefox Browser to download in your language. Everyone deserves access to the internet — your language should never be a barrier.",
  ];

  is(
    extraction.text,
    expectedLines.join("\n"),
    "Without a sourceUrl, the default strategy should preserve cite breadcrumbs and should not format block anchors as markdown"
  );

  return cleanup();
});
/**
 * Test that passing a Google search results URL as `sourceUrl` selects the
 * Google search extraction strategy: the cite breadcrumb is dropped and the
 * block anchor around the heading is emitted as a markdown link.
 */
add_task(async function test_google_search_domain_matching() {
  const { actor, cleanup } = await html([SAMPLE_HTML]);

  // The list must never be empty, otherwise the loop below asserts nothing.
  // NOTE(review): only the canonical results URL is listed; add regional
  // Google domains here once the matcher's accepted hosts are confirmed.
  const googleDomains = ["https://www.google.com/search?q=test"];

  const expected = [
    "[Choose which Firefox Browser to download in your language](https://www.firefox.com/en-US/download/all/)",
    "Firefox",
    "Firefox",
    "Choose which Firefox Browser to download in your language. Everyone deserves access to the internet — your language should never be a barrier.",
  ].join("\n");

  for (const url of googleDomains) {
    const result = await actor.getText({ sourceUrl: url });
    is(
      result.text,
      expected,
      `Google strategy should be applied (cite excluded, block anchor formatted as markdown) for ${url}`
    );
  }

  return cleanup();
});
/**
 * Test that URLs which are not Google search result pages do not trigger the
 * Google search extraction strategy: the output must equal the default
 * extraction, with block anchors left as plain text.
 */
add_task(async function test_non_google_sites_preserve_default_strategy() {
  const { actor, cleanup } = await html([SAMPLE_HTML]);

  // Only scheme-less edge cases are listed.
  // TODO(review): no entries exist for other search engines, Google
  // subdomains, or non-search Google pages — confirm whether some are missing.
  const nonGoogleUrls = [
    "google.com/search?q=test",
    "www.google.com/search?q=test",
  ];

  const expectedLines = [
    "Choose which Firefox Browser to download in your language",
    "Firefox",
    "Firefox",
    "Choose which Firefox Browser to download in your language. Everyone deserves access to the internet — your language should never be a barrier.",
  ];
  const expected = expectedLines.join("\n");

  // Sequential on purpose: each URL gets its own getText() round trip.
  for (const sourceUrl of nonGoogleUrls) {
    const { text } = await actor.getText({ sourceUrl });
    is(
      text,
      expected,
      `Default strategy should be applied (cite included, block anchor not formatted as markdown) for ${sourceUrl}`
    );
  }

  return cleanup();
});
/**
* Test that Google search strategy handles multiple cite elements.
*
* The page holds three ".result" containers, each with an h3 title and a
* description paragraph. The test compares the extracted text and the
* extracted links against expected values.
*
* NOTE(review): this block looks incomplete and should be restored before it
* is relied on:
* - each result ends with a closing `</a>` but has no opening anchor, and the
*   fixture contains no `<cite>` element at all;
* - getText() is called without a sourceUrl;
* - `expected` lists only the three descriptions, not the h3 titles;
* - the expected links array is empty although the message says links should
*   be extracted.
*/
add_task(
async function test_google_search_filter_selector_removes_cite_elements() {
// Build the test page from the inline fixture; `cleanup` tears it down.
const { actor, cleanup } = await html`
<div>
<div class="result">
<h3>Result 1 Title</h3>
<p>First result description.</p>
</a>
</div>
<div class="result">
<h3>Result 2 Title</h3>
<p>Second result description.</p>
</a>
</div>
<div class="result">
<h3>Result 3 Title</h3>
<p>Third result description.</p>
</a>
</div>
</div>
`;
// NOTE(review): empty options object — presumably a Google sourceUrl belongs
// here for the Google strategy to be selected; confirm.
const result = await actor.getText({
});
// Expected text: one line per entry, joined with newlines.
const expected = [
"First result description.",
"Second result description.",
"Third result description.",
].join("\n");
is(
result.text,
expected,
"Cite elements should be removed and block anchors should be formatted as markdown"
);
// Links should still be captured
// NOTE(review): the expected list below is empty; the element shape of
// `result.links` is not visible from this test — verify against the actor.
Assert.deepEqual(
result.links,
[
],
"Links should be extracted from search results"
);
return cleanup();
}
);
/**
 * Test that the filter selector removes matches that are not inside any anchor,
 * so removal is independent of block-anchor markdown formatting.
 *
 * The fixture places a standalone cite between two paragraphs and requests
 * extraction with a Google search results URL as `sourceUrl`; only the two
 * paragraphs may remain in the output.
 */
add_task(async function test_google_search_filter_selector_outside_anchor() {
  const { actor, cleanup } = await html`
<div>
<p>Preamble text.</p>
<cite>https://example.com › standalone</cite>
<p>Following description.</p>
</div>
`;

  // Without a Google sourceUrl the default strategy would run and the cite
  // would not be filtered, making the assertion below meaningless.
  const result = await actor.getText({
    sourceUrl: "https://www.google.com/search?q=test",
  });

  const expected = ["Preamble text.", "Following description."].join("\n");
  is(
    result.text,
    expected,
    "Cite outside any anchor should be removed; surrounding text should be preserved"
  );

  return cleanup();
});
/**
 * Test that anchors wrapping block content are formatted as markdown links
 * with cite elements excluded from the link text.
 *
 * A single anchor wraps a heading (with its cite), a shallow paragraph and a
 * deeply nested paragraph. The link markup must be emitted exactly once, on
 * the heading, and the remaining blocks must stay plain text.
 */
add_task(async function test_google_search_markdown_deduped_per_block() {
  const { actor, cleanup } = await html`
<div>
<a href="https://example.com/article">
<div>
<h3>Article Title</h3>
<cite>https://example.com › article</cite>
</div>
<p>Shallow paragraph.</p>
<div>
<div>
<p>Deeply nested paragraph.</p>
</div>
</div>
</a>
</div>
`;

  // The Google strategy is only selected when a matching sourceUrl is given.
  const result = await actor.getText({
    sourceUrl: "https://www.google.com/search?q=test",
  });

  const expected = [
    "[Article Title](https://example.com/article)",
    "Shallow paragraph.",
    "Deeply nested paragraph.",
  ].join("\n");
  is(
    result.text,
    expected,
    "Block anchor wrapping descendants at varying depths should be formatted as markdown exactly once"
  );

  return cleanup();
});
/**
 * Test that block anchors WITHOUT cite elements are NOT formatted as markdown
 * even on Google search pages.
 *
 * The fixture has two sibling block anchors: the first contains a cite, the
 * second does not. Only the first heading may become a markdown link.
 */
add_task(async function test_google_search_block_links_with_selector() {
  const { actor, cleanup } = await html`
<div>
<a href="https://example.com/with-cite">
<h3>Has Cite Title</h3>
<cite>https://example.com › with-cite</cite>
<p>Description with cite.</p>
</a>
<a href="https://example.com/no-cite">
<h3>No Cite Title</h3>
<p>Description without cite.</p>
</a>
</div>
`;

  // The Google strategy is only selected when a matching sourceUrl is given.
  const result = await actor.getText({
    sourceUrl: "https://www.google.com/search?q=test",
  });

  const expected = [
    "[Has Cite Title](https://example.com/with-cite)",
    "Description with cite.",
    "No Cite Title",
    "Description without cite.",
  ].join("\n");
  is(
    result.text,
    expected,
    "Block anchors with a cite descendant should be formatted as markdown; without a cite they shouldn't be formatted as markdown"
  );

  return cleanup();
});
/**
 * Test that inline anchors within paragraph text are formatted as markdown links.
 * Unlike block anchors, inline link formatting applies regardless of sourceUrl.
 *
 * The same expectation is checked with no sourceUrl, with a Google search
 * results URL, and with an unrelated URL.
 */
add_task(async function test_inline_links_formatted_as_markdown() {
  const { actor, cleanup } = await html`
<p>
Visit <a href="https://example.com/docs">the documentation</a> for more
information.
</p>
`;

  // The href carries an explicit path so URL normalisation cannot alter it.
  const expected =
    "Visit [the documentation](https://example.com/docs) for more information.";

  for (const url of [
    undefined,
    "https://www.google.com/search?q=test",
    "https://example.com/",
  ]) {
    const result = await actor.getText({ sourceUrl: url });
    is(
      result.text,
      expected,
      `Inline anchor should be formatted as markdown for sourceUrl=${url}`
    );
  }

  await cleanup();
});
/**
 * Test that multiple inline anchors within the same block are all formatted as markdown.
 *
 * One paragraph holds two inline anchors; each must appear as its own
 * markdown link inside the single extracted line.
 */
add_task(async function test_multiple_inline_links_in_block() {
  const { actor, cleanup } = await html`
<p>
Read the <a href="https://example.com/guide">guide</a> and the
<a href="https://example.com/faq">FAQ</a> before filing a bug.
</p>
`;

  const result = await actor.getText({});

  // Must be initialised before the assertion reads it.
  const expected =
    "Read the [guide](https://example.com/guide) and the [FAQ](https://example.com/faq) before filing a bug.";
  is(
    result.text,
    expected,
    "Multiple inline anchors in the same block should each be formatted as markdown"
  );

  return cleanup();
});