state.rs - mozsearch

mozilla-central/third_party/rust/quick-xml/src/reader/state.rs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Firefox Build System :: General

Revision control

Copy as Markdown

Other Tools

#[cfg(feature = "encoding")]↩

use encoding_rs::UTF_8;↩

↩

use crate::encoding::Decoder;↩

use crate::errors::{Error, IllFormedError, Result, SyntaxError};↩

use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};↩

#[cfg(feature = "encoding")]↩

use crate::reader::EncodingRef;↩

use crate::reader::{BangType, Config, ParseState};↩

use crate::utils::{is_whitespace, name_len};↩

↩

/// A struct that holds a current reader state and a parser configuration.↩

/// It is independent on a way of reading data: the reader feed data into it and↩

/// get back produced [`Event`]s.↩

#[derive(Clone, Debug)]↩

pub(super) struct ReaderState {↩

    /// Number of bytes read from the source of data since the reader was created↩

    pub offset: u64,↩

    /// A snapshot of an `offset` of the last error returned. It can be less than↩

    /// `offset`, because some errors conveniently report at earlier position,↩

    /// and changing `offset` is not possible, because `Error::IllFormed` errors↩

    /// are recoverable.↩

    pub last_error_offset: u64,↩

    /// Defines how to process next byte↩

    pub state: ParseState,↩

    /// User-defined settings that affect parsing↩

    pub config: Config,↩

    /// All currently Started elements which didn't have a matching↩

    /// End element yet.↩

    ///↩

    /// For an XML↩

    ///↩

    /// ```xml↩

    /// <root><one/><inner attr="value">|<tag></inner></root>↩

    /// ```↩

    /// when cursor at the `|` position buffer contains:↩

    ///↩

    /// ```text↩

    /// rootinner↩

    /// ^   ^↩

    /// ```↩

    ///↩

    /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]↩

    /// (0 and 4 in that case).↩

    opened_buffer: Vec<u8>,↩

    /// Opened name start indexes into [`Self::opened_buffer`]. See documentation↩

    /// for that field for details↩

    opened_starts: Vec<usize>,↩

↩

    #[cfg(feature = "encoding")]↩

    /// Reference to the encoding used to read an XML↩

    pub encoding: EncodingRef,↩

}↩

↩

impl ReaderState {↩

    /// Trims end whitespaces from `bytes`, if required, and returns a text event.↩

    ///↩

    /// # Parameters↩

    /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`↩

    pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> {↩

        let mut content = bytes;↩

↩

        if self.config.trim_text_end {↩

            // Skip the ending '<'↩

            let len = bytes↩

                .iter()↩

                .rposition(|&b| !is_whitespace(b))↩

                .map_or(0, |p| p + 1);↩

            content = &bytes[..len];↩

        }↩

        BytesText::wrap(content, self.decoder())↩

    }↩

↩

    /// Returns `Comment`, `CData` or `DocType` event.↩

    ///↩

    /// `buf` contains data between `<` and `>`:↩

    /// - CDATA: `![CDATA[...]]`↩

    /// - Comment: `!--...--`↩

    /// - Doctype (uppercase): `!D...`↩

    /// - Doctype (lowercase): `!d...`↩

    pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {↩

        debug_assert_eq!(↩

            buf.first(),↩

            Some(&b'!'),↩

            "CDATA, comment or DOCTYPE should start from '!'"↩

        );↩

↩

        let uncased_starts_with = |string: &[u8], prefix: &[u8]| {↩

            string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)↩

        };↩

↩

        let len = buf.len();↩

        match bang_type {↩

            BangType::Comment if buf.starts_with(b"!--") => {↩

                debug_assert!(buf.ends_with(b"--"));↩

                if self.config.check_comments {↩

                    // search if '--' not in comments↩

                    let mut haystack = &buf[3..len - 2];↩

                    let mut off = 0;↩

                    while let Some(p) = memchr::memchr(b'-', haystack) {↩

                        off += p + 1;↩

                        // if next byte after `-` is also `-`, return an error↩

                        if buf[3 + off] == b'-' {↩

                            // Explanation of the magic:↩

                            //↩

                            // - `self.offset`` just after `>`,↩

                            // - `buf` contains `!-- con--tent --`↩

                            // - `p` is counted from byte after `<!--`↩

                            //↩

                            // <!-- con--tent -->:↩

                            //  ~~~~~~~~~~~~~~~~ : - buf↩

                            //   : ===========   : - zone of search (possible values of `p`)↩

                            //   : |---p         : - p is counted from | (| is 0)↩

                            //   : :   :         ^ - self.offset↩

                            //   ^ :   :           - self.offset - len↩

                            //     ^   :           - self.offset - len + 2↩

                            //         ^           - self.offset - len + 2 + p↩

                            self.last_error_offset = self.offset - len as u64 + 2 + p as u64;↩

                            return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));↩

                        }↩

                        // Continue search after single `-` (+1 to skip it)↩

                        haystack = &haystack[p + 1..];↩

                    }↩

                }↩

                Ok(Event::Comment(BytesText::wrap(↩

                    // Cut of `!--` and `--` from start and end↩

                    &buf[3..len - 2],↩

                    self.decoder(),↩

                )))↩

            }↩

            // XML requires uppercase only:↩

            // https://www.w3.org/TR/xml11/#sec-cdata-sect↩

            // Even HTML5 required uppercase only:↩

            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state↩

            BangType::CData if buf.starts_with(b"![CDATA[") => {↩

                debug_assert!(buf.ends_with(b"]]"));↩

                Ok(Event::CData(BytesCData::wrap(↩

                    // Cut of `![CDATA[` and `]]` from start and end↩

                    &buf[8..len - 2],↩

                    self.decoder(),↩

                )))↩

            }↩

            // XML requires uppercase only, but we will check that on validation stage:↩

            // https://www.w3.org/TR/xml11/#sec-prolog-dtd↩

            // HTML5 allows mixed case for doctype declarations:↩

            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state↩

            BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE") => {↩

                match buf[8..].iter().position(|&b| !is_whitespace(b)) {↩

                    Some(start) => Ok(Event::DocType(BytesText::wrap(↩

                        // Cut of `!DOCTYPE` and any number of spaces from start↩

                        &buf[8 + start..],↩

                        self.decoder(),↩

                    ))),↩

                    None => {↩

                        // Because we here, we at least read `<!DOCTYPE>` and offset after `>`.↩

                        // We want report error at place where name is expected - this is just↩

                        // before `>`↩

                        self.last_error_offset = self.offset - 1;↩

                        return Err(Error::IllFormed(IllFormedError::MissingDoctypeName));↩

                    }↩

                }↩

            }↩

            _ => {↩

                // <!....>↩

                //  ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`.↩

                // ^------- We report error at that position, so we need to subtract 2 and buf len↩

                self.last_error_offset = self.offset - len as u64 - 2;↩

                Err(bang_type.to_err().into())↩

            }↩

        }↩

    }↩

↩

    /// Wraps content of `buf` into the [`Event::End`] event. Does the check that↩

    /// end name matches the last opened start name if `self.config.check_end_names` is set.↩

    ///↩

    /// `buf` contains data between `<` and `>`, for example `/tag`.↩

    pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {↩

        debug_assert_eq!(↩

            buf.first(),↩

            Some(&b'/'),↩

            "closing tag should start from '/'"↩

        );↩

↩

        // Strip the `/` character. `content` contains data between `</` and `>`↩

        let content = &buf[1..];↩

        // XML standard permits whitespaces after the markup name in closing tags.↩

        // Let's strip them from the buffer before comparing tag names.↩

        let name = if self.config.trim_markup_names_in_closing_tags {↩

            if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {↩

                &content[..pos_end_name + 1]↩

            } else {↩

                content↩

            }↩

        } else {↩

            content↩

        };↩

↩

        let decoder = self.decoder();↩

↩

        // Get the index in self.opened_buffer of the name of the last opened tag↩

        match self.opened_starts.pop() {↩

            Some(start) => {↩

                if self.config.check_end_names {↩

                    let expected = &self.opened_buffer[start..];↩

                    if name != expected {↩

                        let expected = decoder.decode(expected).unwrap_or_default().into_owned();↩

                        // #513: In order to allow error recovery we should drop content of the buffer↩

                        self.opened_buffer.truncate(start);↩

↩

                        // Report error at start of the end tag at `<` character↩

                        // -2 for `<` and `>`↩

                        self.last_error_offset = self.offset - buf.len() as u64 - 2;↩

                        return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {↩

                            expected,↩

                            found: decoder.decode(name).unwrap_or_default().into_owned(),↩

                        }));↩

                    }↩

                }↩

↩

                self.opened_buffer.truncate(start);↩

            }↩

            None => {↩

                if !self.config.allow_unmatched_ends {↩

                    // Report error at start of the end tag at `<` character↩

                    // -2 for `<` and `>`↩

                    self.last_error_offset = self.offset - buf.len() as u64 - 2;↩

                    return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(↩

                        decoder.decode(name).unwrap_or_default().into_owned(),↩

                    )));↩

                }↩

            }↩

        }↩

↩

        Ok(Event::End(BytesEnd::wrap(name.into())))↩

    }↩

↩

    /// `buf` contains data between `<` and `>` and the first byte is `?`.↩

    /// `self.offset` already after the `>`↩

    ///↩

    /// Returns `Decl` or `PI` event↩

    pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {↩

        debug_assert!(buf.len() > 0);↩

        debug_assert_eq!(buf[0], b'?');↩

↩

        let len = buf.len();↩

        // We accept at least <??>↩

        //                     ~~ - len = 2↩

        if len > 1 && buf[len - 1] == b'?' {↩

            // Cut of `?` and `?` from start and end↩

            let content = &buf[1..len - 1];↩

            let len = content.len();↩

↩

            if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {↩

                let event = BytesDecl::from_start(BytesStart::wrap(content, 3));↩

↩

                // Try getting encoding from the declaration event↩

                #[cfg(feature = "encoding")]↩

                if self.encoding.can_be_refined() {↩

                    if let Some(encoding) = event.encoder() {↩

                        self.encoding = EncodingRef::XmlDetected(encoding);↩

                    }↩

                }↩

↩

                Ok(Event::Decl(event))↩

            } else {↩

                Ok(Event::PI(BytesPI::wrap(content, name_len(content))))↩

            }↩

        } else {↩

            // <?....EOF↩

            //  ^^^^^ - `buf` does not contains `<`, but we want to report error at `<`,↩

            //          so we move offset to it (-2 for `<` and `>`)↩

            self.last_error_offset = self.offset - len as u64 - 2;↩

            Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))↩

        }↩

    }↩

↩

    /// Converts content of a tag to a `Start` or an `Empty` event↩

    ///↩

    /// # Parameters↩

    /// - `content`: Content of a tag between `<` and `>`↩

    pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> {↩

        if let Some(content) = content.strip_suffix(b"/") {↩

            // This is self-closed tag `<something/>`↩

            let event = BytesStart::wrap(content, name_len(content));↩

↩

            if self.config.expand_empty_elements {↩

                self.state = ParseState::InsideEmpty;↩

                self.opened_starts.push(self.opened_buffer.len());↩

                self.opened_buffer.extend(event.name().as_ref());↩

                Event::Start(event)↩

            } else {↩

                Event::Empty(event)↩

            }↩

        } else {↩

            let event = BytesStart::wrap(content, name_len(content));↩

↩

            // #514: Always store names event when .check_end_names == false,↩

            // because checks can be temporary disabled and when they would be↩

            // enabled, we should have that information↩

            self.opened_starts.push(self.opened_buffer.len());↩

            self.opened_buffer.extend(event.name().as_ref());↩

            Event::Start(event)↩

        }↩

    }↩

↩

    #[inline]↩

    pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {↩

        self.state = ParseState::InsideText;↩

        let name = self↩

            .opened_buffer↩

            .split_off(self.opened_starts.pop().unwrap());↩

        BytesEnd::wrap(name.into())↩

    }↩

↩

    /// Get the decoder, used to decode bytes, read by this reader, to the strings.↩

    ///↩

    /// If [`encoding`] feature is enabled, the used encoding may change after↩

    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.↩

    ///↩

    /// If [`encoding`] feature is enabled and no encoding is specified in declaration,↩

    /// defaults to UTF-8.↩

    ///↩

    /// [`encoding`]: ../../index.html#encoding↩

    pub const fn decoder(&self) -> Decoder {↩

        Decoder {↩

            #[cfg(feature = "encoding")]↩

            encoding: self.encoding.encoding(),↩

        }↩

    }↩

}↩

↩

impl Default for ReaderState {↩

    fn default() -> Self {↩

        Self {↩

            offset: 0,↩

            last_error_offset: 0,↩

            state: ParseState::Init,↩

            config: Config::default(),↩

            opened_buffer: Vec::new(),↩

            opened_starts: Vec::new(),↩

↩

            #[cfg(feature = "encoding")]↩

            encoding: EncodingRef::Implicit(UTF_8),↩

        }↩

    }↩

}↩