Source code
Revision control
Copy as Markdown
Other Tools
//! Parsing functions and collections for handling with multiple filter rules.
use std::convert::TryFrom;
use crate::filters::cosmetic::{CosmeticFilter, CosmeticFilterError};
use crate::filters::network::{NetworkFilter, NetworkFilterError};
use crate::resources::PermissionMask;
use itertools::{Either, Itertools};
use memchr::memchr as find_char;
use serde::{Deserialize, Serialize};
use thiserror::Error;
/// Specifies rule types to keep during parsing.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
pub enum RuleTypes {
#[default]
All,
NetworkOnly,
CosmeticOnly,
}
impl RuleTypes {
pub fn loads_network_rules(&self) -> bool {
matches!(self, Self::All | Self::NetworkOnly)
}
pub fn loads_cosmetic_rules(&self) -> bool {
matches!(self, Self::All | Self::CosmeticOnly)
}
}
/// Options for tweaking how a filter or list of filters is interpreted when parsing. It's
/// recommended to use _struct update syntax_ with a `default()` "rest" value; adding new fields to
/// this struct will not be considered a breaking change.
///
/// ```
/// # use adblock::lists::{FilterFormat, ParseOptions};
/// let parse_options = ParseOptions {
/// format: FilterFormat::Hosts,
/// ..ParseOptions::default()
/// };
/// ```
#[derive(Copy, Clone, Deserialize)]
pub struct ParseOptions {
/// Assume filters are in the given format when parsing. Defaults to `FilterFormat::Standard`.
#[serde(default)]
pub format: FilterFormat,
/// Specifies rule types to keep during parsing. Defaults to `RuleTypes::All`. This can be used
/// to reduce the memory impact of engines that will only be used for cosmetic filtering or
/// network filtering, but not both. It can also be useful for iOS and macOS when exporting to
/// content-blocking syntax, as these platforms limit the number of content blocking rules that
/// can be loaded.
#[serde(default)]
pub rule_types: RuleTypes,
/// Specifies permissions to use when parsing a given filter list. See [`PermissionMask`] for
/// more info.
#[serde(default)]
pub permissions: PermissionMask,
}
impl Default for ParseOptions {
fn default() -> Self {
ParseOptions {
format: FilterFormat::Standard,
rule_types: RuleTypes::All,
permissions: PermissionMask::default(),
}
}
}
/// Manages a set of rules to be added to an [`crate::Engine`].
///
/// To be able to efficiently handle special options like `$badfilter`, and to allow optimizations,
/// all rules must be available when the `Engine` is first created. `FilterSet` allows assembling a
/// compound list from multiple different sources before compiling the rules into an `Engine`.
#[derive(Clone)]
pub struct FilterSet {
debug: bool,
pub(crate) network_filters: Vec<NetworkFilter>,
pub(crate) cosmetic_filters: Vec<CosmeticFilter>,
}
/// Collects metadata for the list by reading just until the first non-comment line.
pub fn read_list_metadata(list: &str) -> FilterListMetadata {
let mut metadata = FilterListMetadata::default();
// uBO only searches within the first 1024 characters; the same optimization can be useful here
let mut cutoff = list.len().min(1024);
while !list.is_char_boundary(cutoff) {
cutoff -= 1;
}
// String slice is safe here because `cutoff` is guaranteed to be a character boundary
for line in list[0..cutoff].lines() {
if line.starts_with('!') {
metadata.try_add(line);
} else if line.starts_with('[') {
continue;
} else {
break;
}
}
metadata
}
impl Default for FilterSet {
/// Equivalent to `FilterSet::new(false)`, or `FilterSet::new(true)` when compiled in test
/// configuration.
fn default() -> Self {
#[cfg(not(test))]
let debug = false;
#[cfg(test)]
let debug = true;
Self::new(debug)
}
}
/// Corresponds to the `expires` field of `FilterListMetadata`.
#[derive(Debug, PartialEq, Serialize)]
pub enum ExpiresInterval {
Hours(u16),
Days(u8),
}
impl TryFrom<&str> for ExpiresInterval {
type Error = ();
fn try_from(v: &str) -> Result<Self, ()> {
const DAYS_MAX: u8 = 14;
const HOURS_MAX: u16 = DAYS_MAX as u16 * 24;
// Extract time amount and unit from str
let mut v_split = v.split(' ');
let amount = v_split.next().ok_or(())?;
let unit = v_split.next().ok_or(())?;
// str::parse::<u16> accepts a leading plus sign, but we explicitly forbid it here
if amount.starts_with('+') {
return Err(());
}
// Only accept values in the range [1, MAX] for values with a matching unit
match unit {
"hour" | "hours" => {
let amount = amount.parse::<u16>().map_err(|_| ())?;
if (1..=HOURS_MAX).contains(&amount) {
return Ok(Self::Hours(amount));
}
}
"day" | "days" => {
let amount = amount.parse::<u8>().map_err(|_| ())?;
if (1..=DAYS_MAX).contains(&amount) {
return Ok(Self::Days(amount));
}
}
_ => (),
}
Err(())
}
}
/// Includes information about any "special comments" as described by
#[derive(Default, Serialize)]
pub struct FilterListMetadata {
/// `! Homepage: http://example.com` - This comment determines which webpage should be linked
/// as filter list homepage.
pub homepage: Option<String>,
/// `! Title: FooList` - This comment sets a fixed title for the filter list. If this comment
/// is present, the user is no longer able to change the title.
pub title: Option<String>,
/// `! Expires: 5 days` - This comment sets the update interval for the filter list. The value
/// can be given in days (e.g. 5 days) or hours (e.g. 8 hours). Any value between 1 hour and 14
/// days is possible. Note that the update will not necessarily happen after this time
/// interval. The actual update time is slightly randomized and depends on some additional
/// factors to reduce server load.
pub expires: Option<ExpiresInterval>,
/// `! Redirect: http://example.com/list.txt` - This comment indicates that the filter list has
/// moved to a new download address. Adblock Plus ignores any file content beyond that comment
/// and immediately tries downloading from the new address. In case of success, the address of
/// the filter list is updated in the settings. This comment is ignored if the new address is
/// the same as the current address, meaning that it can be used to enforce the "canonical"
/// address of the filter list.
pub redirect: Option<String>,
}
impl FilterListMetadata {
/// Attempts to add a line of a filter list to this collection of metadata. Only comment lines
/// with valid metadata content will be added. Previously added information will not be
/// rewritten.
fn try_add(&mut self, line: &str) {
if let Some(kv) = line.strip_prefix("! ") {
if let Some((key, value)) = kv.split_once(": ") {
match key {
"Homepage" if self.homepage.is_none() => {
self.homepage = Some(value.to_string())
}
"Title" if self.title.is_none() => self.title = Some(value.to_string()),
"Expires" if self.expires.is_none() => {
if let Ok(expires) = ExpiresInterval::try_from(value) {
self.expires = Some(expires);
}
}
"Redirect" if self.redirect.is_none() => {
self.redirect = Some(value.to_string())
}
_ => (),
}
}
}
}
}
impl FilterSet {
/// Creates a new `FilterSet`. `debug` specifies whether or not to save information about the
/// original raw filter rules alongside the more compact internal representation. If enabled,
/// this information will be passed to the corresponding `Engine`.
pub fn new(debug: bool) -> Self {
Self {
debug,
network_filters: Vec::new(),
cosmetic_filters: Vec::new(),
}
}
// Used in benchmarks to avoid parsing the rules twice.
#[doc(hidden)]
pub fn new_with_rules(
network_filters: Vec<NetworkFilter>,
cosmetic_filters: Vec<CosmeticFilter>,
debug: bool,
) -> Self {
Self {
debug,
network_filters,
cosmetic_filters,
}
}
/// Adds the contents of an entire filter list to this `FilterSet`. Filters that cannot be
/// parsed successfully are ignored. Returns any discovered metadata about the list of rules
/// added.
pub fn add_filter_list(&mut self, filter_list: &str, opts: ParseOptions) -> FilterListMetadata {
self.add_filters(filter_list.lines(), opts)
}
/// Adds a collection of filter rules to this `FilterSet`. Filters that cannot be parsed
/// successfully are ignored. Returns any discovered metadata about the list of rules added.
pub fn add_filters(
&mut self,
filters: impl IntoIterator<Item = impl AsRef<str>>,
opts: ParseOptions,
) -> FilterListMetadata {
let (metadata, parsed_network_filters, parsed_cosmetic_filters) =
parse_filters_with_metadata(filters, self.debug, opts);
self.network_filters.extend(parsed_network_filters);
self.cosmetic_filters.extend(parsed_cosmetic_filters);
metadata
}
/// Adds the string representation of a single filter rule to this `FilterSet`.
pub fn add_filter(&mut self, filter: &str, opts: ParseOptions) -> Result<(), FilterParseError> {
let filter_parsed = parse_filter(filter, self.debug, opts);
match filter_parsed? {
ParsedFilter::Network(filter) => self.network_filters.push(filter),
ParsedFilter::Cosmetic(filter) => self.cosmetic_filters.push(filter),
}
Ok(())
}
/// Consumes this `FilterSet`, returning an equivalent list of content blocking rules and a
/// corresponding new list containing the `String` representation of all filters that were
/// successfully converted (as `FilterFormat::Standard` rules).
///
/// The list of content blocking rules will be properly ordered to ensure correct behavior of
/// `ignore-previous-rules`-typed rules.
///
/// This function will fail if the `FilterSet` was not created in debug mode.
#[cfg(feature = "content-blocking")]
#[allow(clippy::result_unit_err)]
pub fn into_content_blocking(
self,
) -> Result<(Vec<crate::content_blocking::CbRule>, Vec<String>), ()> {
use crate::content_blocking;
use crate::filters::network::NetworkFilterMaskHelper;
use std::collections::HashSet;
if !self.debug {
return Err(());
}
// Store bad filter id to skip them later.
let mut bad_filter_ids = HashSet::new();
for filter in self.network_filters.iter() {
if filter.is_badfilter() {
bad_filter_ids.insert(filter.get_id_without_badfilter());
}
}
let mut ignore_previous_rules = vec![];
let mut other_rules = vec![];
let mut filters_used = vec![];
self.network_filters.into_iter().for_each(|filter| {
// Don't process bad filter rules or matching bad filter rules.
if bad_filter_ids.contains(&filter.get_id()) || filter.is_badfilter() {
return;
}
let original_rule = *filter
.raw_line
.clone()
.expect("All rules should be in debug mode");
if let Ok(equivalent) = TryInto::<content_blocking::CbRuleEquivalent>::try_into(filter)
{
filters_used.push(original_rule);
equivalent
.into_iter()
.for_each(|cb_rule| match &cb_rule.action.typ {
content_blocking::CbType::IgnorePreviousRules => {
ignore_previous_rules.push(cb_rule)
}
_ => other_rules.push(cb_rule),
});
}
});
let add_fp_document_exception = !filters_used.is_empty();
self.cosmetic_filters.into_iter().for_each(|filter| {
let original_rule = *filter
.raw_line
.clone()
.expect("All rules should be in debug mode");
if let Ok(cb_rule) = TryInto::<content_blocking::CbRule>::try_into(filter) {
filters_used.push(original_rule);
match &cb_rule.action.typ {
content_blocking::CbType::IgnorePreviousRules => {
ignore_previous_rules.push(cb_rule)
}
_ => other_rules.push(cb_rule),
}
}
});
other_rules.extend(ignore_previous_rules);
if add_fp_document_exception {
other_rules.push(content_blocking::ignore_previous_fp_documents());
}
Ok((other_rules, filters_used))
}
}
/// Denotes the format of a particular list resource, which affects how its rules should be parsed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum FilterFormat {
/// Rules should be parsed in ABP/uBO-style format.
Standard,
/// Each line consists of an IP address (usually 127.0.0.1 or 0.0.0.0), some whitespace, and a
/// single hostname. This syntax is normally used directly for HOSTS-based adblockers. These
/// rules will be treated equivalently to `"||hostname^"` rules in `Standard` format; the IP
/// addresses will not be used.
///
/// Note that some sources provide a more raw format, where each line consists of just a
/// hostname. This option will also accept that format.
///
/// For this option, `!` is accepted as a comment character at the beginning of a line, and `#`
/// is accepted as a comment character anywhere in a line.
Hosts,
}
/// Default to parsing lists in `Standard` format.
impl Default for FilterFormat {
fn default() -> Self {
Self::Standard
}
}
/// Describes the type of a single filter.
#[derive(Debug, PartialEq)]
pub enum FilterType {
/// A network filter, used for changing the behavior of network requests
Network,
/// A network filter, used for changing the behavior of fetched pages
Cosmetic,
/// Something else that isn't supported
NotSupported,
}
/// Successful result of parsing a single filter rule
pub enum ParsedFilter {
Network(NetworkFilter),
Cosmetic(CosmeticFilter),
}
impl From<NetworkFilter> for ParsedFilter {
fn from(v: NetworkFilter) -> Self {
ParsedFilter::Network(v)
}
}
impl From<CosmeticFilter> for ParsedFilter {
fn from(v: CosmeticFilter) -> Self {
ParsedFilter::Cosmetic(v)
}
}
/// Unsuccessful result of parsing a single filter rule.
#[derive(Debug, Error)]
pub enum FilterParseError {
#[error("network filter error: {0}")]
Network(#[source] NetworkFilterError),
#[error("cosmetic filter error: {0}")]
Cosmetic(#[source] CosmeticFilterError),
#[error("unsupported")]
Unsupported,
#[error("empty")]
Empty,
}
impl From<NetworkFilterError> for FilterParseError {
fn from(v: NetworkFilterError) -> Self {
FilterParseError::Network(v)
}
}
impl From<CosmeticFilterError> for FilterParseError {
fn from(v: CosmeticFilterError) -> Self {
FilterParseError::Cosmetic(v)
}
}
/// Parse a single filter rule
pub fn parse_filter(
line: &str,
debug: bool,
opts: ParseOptions,
) -> Result<ParsedFilter, FilterParseError> {
let filter = line.trim();
if filter.is_empty() {
return Err(FilterParseError::Empty);
}
match opts.format {
FilterFormat::Standard => match (detect_filter_type(filter), opts.rule_types) {
(FilterType::Network, RuleTypes::All | RuleTypes::NetworkOnly) => {
NetworkFilter::parse(filter, debug, opts)
.map(|f| f.into())
.map_err(|e| e.into())
}
(FilterType::Cosmetic, RuleTypes::All | RuleTypes::CosmeticOnly) => {
CosmeticFilter::parse(filter, debug, opts.permissions)
.map(|f| f.into())
.map_err(|e| e.into())
}
_ => Err(FilterParseError::Unsupported),
},
FilterFormat::Hosts => {
// Hosts-style rules can only ever be network rules
if !opts.rule_types.loads_network_rules() {
return Err(FilterParseError::Unsupported);
}
if filter.starts_with('!') {
return Err(FilterParseError::Unsupported);
}
// Discard contents after first `#` character
let filter = if let Some(hash_loc) = find_char(b'#', filter.as_bytes()) {
let filter = &filter[..hash_loc];
let filter = filter.trim();
if filter.is_empty() {
return Err(FilterParseError::Unsupported);
}
filter
} else {
filter
};
// Take the last of at most 2 whitespace separated fields
let mut filter_parts = filter.split_whitespace();
let hostname = match (
filter_parts.next(),
filter_parts.next(),
filter_parts.next(),
) {
(None, None, None) => return Err(FilterParseError::Unsupported),
(Some(hostname), None, None) => hostname,
(Some(_ip), Some(hostname), None) => hostname,
(Some(_), Some(_), Some(_)) => return Err(FilterParseError::Unsupported),
_ => unreachable!(),
};
// Matches in hosts lists are usually redirected to localhost. For that reason, some
// lists include an entry for "localhost", which should be explicitly ignored when
// performing request-level adblocking.
if hostname == "localhost" {
return Err(FilterParseError::Unsupported);
}
NetworkFilter::parse_hosts_style(hostname, debug)
.map(|f| f.into())
.map_err(|e| e.into())
}
}
}
/// Parse an entire list of filters, ignoring any errors
pub fn parse_filters(
list: impl IntoIterator<Item = impl AsRef<str>>,
debug: bool,
opts: ParseOptions,
) -> (Vec<NetworkFilter>, Vec<CosmeticFilter>) {
let (_metadata, network_filters, cosmetic_filters) =
parse_filters_with_metadata(list, debug, opts);
(network_filters, cosmetic_filters)
}
/// Parse an entire list of filters, ignoring any errors
pub fn parse_filters_with_metadata(
list: impl IntoIterator<Item = impl AsRef<str>>,
debug: bool,
opts: ParseOptions,
) -> (FilterListMetadata, Vec<NetworkFilter>, Vec<CosmeticFilter>) {
let mut metadata = FilterListMetadata::default();
let list_iter = list.into_iter();
let (network_filters, cosmetic_filters): (Vec<_>, Vec<_>) = list_iter
.map(|line| {
metadata.try_add(line.as_ref());
parse_filter(line.as_ref(), debug, opts)
})
.filter_map(Result::ok)
.partition_map(|filter| match filter {
ParsedFilter::Network(f) => Either::Left(f),
ParsedFilter::Cosmetic(f) => Either::Right(f),
});
(metadata, network_filters, cosmetic_filters)
}
/// Given a single line, checks if this would likely be a cosmetic filter, a
/// network filter or something that is not supported. This check is performed
/// before calling a more specific parser to create an instance of
/// `NetworkFilter` or `CosmeticFilter`.
fn detect_filter_type(filter: &str) -> FilterType {
// Ignore comments
if filter.len() == 1
|| filter.starts_with('!')
|| (filter.starts_with('#') && filter[1..].starts_with(char::is_whitespace))
|| filter.starts_with("[Adblock")
{
return FilterType::NotSupported;
}
if filter.starts_with('|') || filter.starts_with("@@|") {
return FilterType::Network;
}
// Check if filter is cosmetic
if let Some(sharp_index) = find_char(b'#', filter.as_bytes()) {
let after_sharp_index = sharp_index + 1;
// Check the next few bytes for a second `#`
// Indexing is safe here because it uses the filter's byte
// representation and guards against short strings
if find_char(
b'#',
&filter.as_bytes()[after_sharp_index..(after_sharp_index + 4).min(filter.len())],
)
.is_some()
{
return FilterType::Cosmetic;
}
}
// Ignore Adguard cosmetics
if filter.contains("$$") {
return FilterType::NotSupported;
}
// Everything else is a network filter
FilterType::Network
}
#[cfg(test)]
#[path = "../tests/unit/lists.rs"]
mod unit_tests;