From 0e1552eeaa40c61b84d09e4231f2809602b75e57 Mon Sep 17 00:00:00 2001 From: asonix Date: Sun, 31 Jan 2021 13:50:34 -0600 Subject: [PATCH] Content: Use own bbcode impl This gives us more control over things, like automatic 'linkifying', and the ability to add custom user tagging logic --- content/Cargo.toml | 3 +- content/src/bbcode.rs | 594 ++++++++++++++++++++++++++++++++++++++++++ content/src/color.rs | 43 +++ content/src/email.rs | 127 +++++++++ content/src/handle.rs | 68 +++++ content/src/lib.rs | 62 +++-- content/src/render.rs | 483 ++++++++++++++++++++++++++++++++++ content/src/url.rs | 401 ++++++++++++++++++++++++++++ 8 files changed, 1761 insertions(+), 20 deletions(-) create mode 100644 content/src/bbcode.rs create mode 100644 content/src/color.rs create mode 100644 content/src/email.rs create mode 100644 content/src/handle.rs create mode 100644 content/src/render.rs create mode 100644 content/src/url.rs diff --git a/content/Cargo.toml b/content/Cargo.toml index f9f55ed..3bbcdb0 100644 --- a/content/Cargo.toml +++ b/content/Cargo.toml @@ -8,6 +8,7 @@ edition = "2018" [dependencies] ammonia = "3.1.0" -bbclash = "1.1.1" +combine = "4.5.2" log = "0.4" once_cell = "1.5.2" +thiserror = "1" diff --git a/content/src/bbcode.rs b/content/src/bbcode.rs new file mode 100644 index 0000000..c598cae --- /dev/null +++ b/content/src/bbcode.rs @@ -0,0 +1,594 @@ +use crate::{ + color::color, + email::email, + handle::handle, + url::{url, Url}, +}; +use combine::{ + attempt, between, choice, error::StreamError, look_ahead, many, many1, parser, satisfy, + stream::StreamErrorFor, token, value, Parser, Stream, +}; + +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum Tag { + Right, + Center, + Color, + Quote, + Code, + Codeblock, + Pre, + Mono, + Sub, + Sup, + S, + Spoiler, + Bold, + Strong, + I, + Em, + U, + Smcaps, + IconText, + Icon, + Hr, + Url, +} + +#[derive(Clone, Debug)] +pub(crate) enum Node { + TagNode { + tag: Tag, + attr: Option, + 
children: Vec, + }, + UrlNode { + url: Url, + }, + HandleNode { + handle: String, + domain: String, + }, + EmailNode { + email: String, + }, + CharNode { + text: char, + }, + NewlineNode, +} + +#[derive(Clone, Debug, thiserror::Error)] +#[error("Invalid tag: {0}")] +struct TagError(String); + +#[derive(Clone, Debug, thiserror::Error)] +#[error("Backing out due to found closing tag")] +struct ClosingTagBackout; + +impl Tag { + fn needs_closing(&self) -> bool { + !matches!(self, Tag::Hr) + } + + fn with_attribute(&self, attribute: &Option) -> Option { + let attr = attribute.as_deref()?; + + match self { + Tag::Color => color().parse(attr).ok().map(|color| color.0), + Tag::Url => url().parse(attr).ok().map(|url| url.0.to_string()), + _ => None, + } + } +} + +fn tag_string() -> impl Parser +where + Input: Stream, +{ + many1(satisfy(|c| c != ']')) +} + +fn tag() -> impl Parser)> +where + Input: Stream, +{ + tag_string().and_then(|full_tag| { + let mut iter = full_tag.split('='); + let tag_name = iter + .next() + .ok_or_else(|| StreamErrorFor::::other(TagError(full_tag.clone())))?; + + let tag = match tag_name { + "right" => Tag::Right, + "center" => Tag::Center, + "quote" => Tag::Quote, + "color" => Tag::Color, + "code" => Tag::Code, + "codeblock" => Tag::Codeblock, + "pre" => Tag::Pre, + "mono" => Tag::Mono, + "sub" => Tag::Sub, + "sup" => Tag::Sup, + "s" => Tag::S, + "spoiler" => Tag::Spoiler, + "bold" => Tag::Bold, + "strong" => Tag::Strong, + "i" => Tag::I, + "em" => Tag::Em, + "u" => Tag::U, + "smcaps" => Tag::Smcaps, + "icontext" => Tag::IconText, + "icon" => Tag::Icon, + "hr" => Tag::Hr, + "url" => Tag::Url, + _ => { + return Err(StreamErrorFor::::other(TagError( + tag_name.to_owned(), + ))) + } + }; + + let attribute = iter.next().map(|s| s.to_owned()); + + Ok((tag, attribute)) + }) +} + +fn closing_tag(tag: Tag) -> impl Parser +where + Input: Stream, +{ + between(token('['), token(']'), tag_string()).and_then(move |closing_tag| { + if closing_tag == 
format!("/{}", tag) { + Ok(()) + } else { + Err(StreamErrorFor::::other(TagError(closing_tag))) + } + }) +} + +fn openening_tag() -> impl Parser)> +where + Input: Stream, +{ + between(token('['), token(']'), tag()) +} + +fn url_inner(attr: Option) -> impl Parser +where + Input: Stream, +{ + let tag = Tag::Url; + if let Some(attr) = tag.with_attribute(&attr) { + tag_body(tag, Some(attr)).left() + } else { + url_node() + .map(move |node| Node::TagNode { + tag, + attr: None, + children: vec![node], + }) + .skip(closing_tag(tag)) + .right() + } +} + +fn icon_text_inner() -> impl Parser +where + Input: Stream, +{ + handle_node() + .skip(closing_tag(Tag::IconText)) + .map(|node| Node::TagNode { + tag: Tag::IconText, + attr: None, + children: vec![node], + }) +} + +fn icon_inner() -> impl Parser +where + Input: Stream, +{ + handle_node() + .skip(closing_tag(Tag::Icon)) + .map(|node| Node::TagNode { + tag: Tag::Icon, + attr: None, + children: vec![node], + }) +} + +fn tag_body(tag: Tag, attr: Option) -> impl Parser +where + Input: Stream, +{ + node_vec(Some(tag)) + .skip(closing_tag(tag)) + .map(move |children| Node::TagNode { + tag, + attr: tag.with_attribute(&attr), + children, + }) +} + +fn singleton_tag(tag: Tag) -> impl Parser +where + Input: Stream, +{ + value(Node::TagNode { + tag, + attr: None, + children: vec![], + }) +} + +fn tag_node() -> impl Parser +where + Input: Stream, +{ + openening_tag().then(|(tag, attr)| { + if tag.needs_closing() { + match tag { + Tag::Url => url_inner(attr).left().left().left(), + Tag::IconText => icon_text_inner().right().left().left(), + Tag::Icon => icon_inner().left().right().left(), + _ => tag_body(tag, attr).right().right().left(), + } + } else { + singleton_tag(tag).right() + } + }) +} + +fn handle_node() -> impl Parser +where + Input: Stream, +{ + handle().map(|handle| Node::HandleNode { + handle: handle.handle, + domain: handle.domain, + }) +} + +fn email_node() -> impl Parser +where + Input: Stream, +{ + email().map(|email| 
Node::EmailNode { email }) +} + +fn url_node() -> impl Parser +where + Input: Stream, +{ + url().map(|url| Node::UrlNode { url }) +} + +fn valid_char() -> impl Parser +where + Input: Stream, +{ + satisfy(|c| c != '\n') +} + +fn char_node(closing: Option) -> impl Parser +where + Input: Stream, +{ + if let Some(tag) = closing { + look_ahead(closing_tag(tag)) + .map(|_| None) + .or(valid_char().map(Some)) + .and_then(|text: Option| { + if let Some(text) = text { + Ok(Node::CharNode { text }) + } else { + Err(StreamErrorFor::::other(ClosingTagBackout)) + } + }) + .left() + } else { + valid_char().map(|text| Node::CharNode { text }).right() + } +} + +fn newline_node() -> impl Parser +where + Input: Stream, +{ + many1(combine::parser::char::char('\n')).map(|_: String| Node::NewlineNode) +} + +fn single_node(closing: Option) -> impl Parser +where + Input: Stream, +{ + choice(( + attempt(tag_node()), + attempt(handle_node()), + attempt(email_node()), + attempt(url_node()), + char_node(closing), + newline_node(), + )) +} + +fn node_vec_(closing: Option) -> impl Parser> +where + Input: Stream, +{ + many(single_node(closing)) +} + +parser! 
{ + pub(crate) fn node_vec[Input](closing: Option)(Input) -> Vec + where [Input: Stream] + { + node_vec_(*closing) + } +} + +impl std::fmt::Display for Tag { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Tag::Right => "right", + Tag::Center => "center", + Tag::Quote => "quote", + Tag::Color => "color", + Tag::Code => "code", + Tag::Codeblock => "codeblock", + Tag::Pre => "pre", + Tag::Mono => "mono", + Tag::Sub => "sub", + Tag::Sup => "sup", + Tag::S => "s", + Tag::Spoiler => "spoiler", + Tag::Bold => "bold", + Tag::Strong => "strong", + Tag::I => "i", + Tag::Em => "em", + Tag::U => "u", + Tag::Smcaps => "smcaps", + Tag::IconText => "icontext", + Tag::Icon => "icon", + Tag::Hr => "hr", + Tag::Url => "url", + }; + + write!(f, "{}", s) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use combine::EasyParser; + + #[test] + fn parse_closing_tag() { + let (_, rest) = closing_tag(Tag::Right).easy_parse("[/right]").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn parse_tag() { + let (_, rest) = tag().easy_parse("right").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn parse_right_node() { + let (_, rest) = tag_node().easy_parse("[right][/right]").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn parse_nested_node() { + let (node, rest) = tag_node() + .easy_parse("[center][bold][/bold][/center]") + .unwrap(); + assert_eq!(rest, ""); + match node { + Node::TagNode { tag, children, .. 
} => { + assert_eq!(tag, Tag::Center); + assert_eq!(children.len(), 1); + } + _ => panic!("Invalid node type"), + } + } + + #[test] + fn parse_multiple_nodes() { + let (vec, rest) = node_vec(None) + .easy_parse("[center][/center][right][/right]") + .unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 2); + } + + #[test] + fn parse_plaintext() { + let input = "this is a plaintext string"; + let (vec, rest) = node_vec(None).easy_parse(input).unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), input.len()); + } + + #[test] + fn parse_text_with_bracket() { + let input = "plaintext [ but with an open bracket"; + let (vec, rest) = node_vec(None).easy_parse(input).unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), input.len()); + } + + #[test] + fn parse_text_with_bad_tag() { + let input = "bad tag [here] is parsed fine"; + let (vec, rest) = node_vec(None).easy_parse(input).unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), input.len()); + } + + #[test] + fn parse_url() { + let (vec, rest) = node_vec(None) + .easy_parse("https://example.com:80/path?query#fragment") + .unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 1); + } + + #[test] + fn parse_string_with_url() { + let (vec, rest) = node_vec(None) + .easy_parse("hello http://example.com world") + .unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 13); + } + + #[test] + fn parse_url_tag() { + let (vec, rest) = node_vec(None) + .easy_parse("[url=http://example.com]hey there[/url]") + .unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 1); + } + + #[test] + fn parse_url_tag_2() { + let (vec, rest) = node_vec(None) + .easy_parse("[url]http://example.com[/url]") + .unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 1); + } + + #[test] + fn parse_invalid_url_tag() { + let input = "[url]not a url[/url]"; + let (vec, rest) = node_vec(None).easy_parse(input).unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), input.len()); + } + + #[test] + fn 
parse_invalid_url_tag_2() { + let input = "[url=bad]not a url[/url]"; + let (vec, rest) = node_vec(None).easy_parse(input).unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), input.len()); + } + + #[test] + fn parse_text_with_color_name() { + let input = "some [color=white]text[/color]"; + let (vec, rest) = node_vec(None).easy_parse(input).unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 6); + } + + #[test] + fn parse_text_with_color_hash() { + let input = "some [color=#fff]text[/color]"; + let (vec, rest) = node_vec(None).easy_parse(input).unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 6); + } + + #[test] + fn parse_text_with_mixed_tags() { + let (vec, rest) = node_vec(None) + .easy_parse("[bold]bold text[/bold] with a [bad] tag and a [hr] good tag") + .unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 35); + } + + #[test] + fn parse_handle_node() { + let (vec, rest) = node_vec(None).easy_parse("@one@two").unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 1); + } + + #[test] + fn parse_handle_node_in_text() { + let (vec, rest) = node_vec(None).easy_parse("before @han@dle after").unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 14); + } + + #[test] + fn parse_icon_tag() { + let (vec, rest) = node_vec(None).easy_parse("[icon]@han@dle[/icon]").unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 1); + } + + #[test] + fn parse_invalid_icon_tag() { + let input = "[icon]bad[/icon]"; + let (vec, rest) = node_vec(None).easy_parse(input).unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), input.len()); + } + + #[test] + fn parse_icontext_tag() { + let (vec, rest) = node_vec(None) + .easy_parse("[icontext]@han@dle[/icontext]") + .unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 1); + } + + #[test] + fn parse_invalid_icontext_tag() { + let input = "[icontext]bad[/icontext]"; + let (vec, rest) = node_vec(None).easy_parse(input).unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), input.len()); + } + + 
#[test] + fn parse_email_node() { + let (vec, rest) = node_vec(None).easy_parse("one.two@three.four").unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 1); + } + + #[test] + fn parse_email_in_text() { + let (vec, rest) = node_vec(None) + .easy_parse("this is a string with.an@email") + .unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 18); + } + + #[test] + fn parse_newline() { + let (vec, rest) = node_vec(None).easy_parse("\n").unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 1); + } + + #[test] + fn parse_multiple_newlines() { + let (vec, rest) = node_vec(None).easy_parse("\n\n\n").unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 1); + } + + #[test] + fn parse_newlines_in_text() { + let (vec, rest) = node_vec(None).easy_parse("hewwo\n\n\nmr\nobama\n").unwrap(); + assert_eq!(rest, ""); + assert_eq!(vec.len(), 15); + } +} diff --git a/content/src/color.rs b/content/src/color.rs new file mode 100644 index 0000000..14da45a --- /dev/null +++ b/content/src/color.rs @@ -0,0 +1,43 @@ +use combine::{ + choice, count, many1, + parser::char::{char as parsechar, hex_digit, lower}, + Parser, Stream, +}; + +pub(crate) fn color() -> impl Parser +where + Input: Stream, +{ + let hashcolor = parsechar('#') + .with(count(8, hex_digit())) + .map(|color: String| format!("#{}", color)); + + let namecolor = many1(lower()); + + choice((hashcolor, namecolor)) +} + +#[cfg(test)] +mod tests { + use super::*; + use combine::EasyParser; + + #[test] + fn parse_shortcolor() { + let (_, rest) = color().easy_parse("#aaa").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn parse_longcolor() { + let (_, rest) = color().easy_parse("#aaaaaaff").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn parse_colorname() { + let (value, rest) = color().easy_parse("white").unwrap(); + assert_eq!(rest, ""); + assert_eq!(value, "white"); + } +} diff --git a/content/src/email.rs b/content/src/email.rs new file mode 100644 index 0000000..40c72ec --- /dev/null +++ 
b/content/src/email.rs @@ -0,0 +1,127 @@ +use crate::url::domain; +use combine::{ + choice, many, many1, + parser::char::{alpha_num, char as parsechar}, + Parser, Stream, +}; + +fn unquoted() -> impl Parser +where + Input: Stream, +{ + choice(( + alpha_num(), + parsechar('!'), + parsechar('#'), + parsechar('$'), + parsechar('%'), + parsechar('&'), + parsechar('\''), + parsechar('*'), + parsechar('+'), + parsechar('-'), + parsechar('/'), + parsechar('='), + parsechar('?'), + parsechar('^'), + parsechar('`'), + parsechar('{'), + parsechar('|'), + parsechar('}'), + parsechar('~'), + )) +} + +fn unquoted_middle_segment() -> impl Parser +where + Input: Stream, +{ + parsechar('.') + .and(many1(unquoted())) + .map(|(c, s): (_, String)| { + let mut string = String::new(); + string.push(c); + string += &s; + string + }) +} + +fn unquoted_full() -> impl Parser +where + Input: Stream, +{ + many1(unquoted()) + .and(many(unquoted_middle_segment())) + .map(|(s1, s2): (String, String)| s1 + &s2) +} + +pub(crate) fn email() -> impl Parser +where + Input: Stream, +{ + unquoted_full() + .skip(parsechar('@')) + .and(domain().map(|d| d.0)) + .map(|(local, domain)| format!("{}@{}", local, domain)) +} + +#[cfg(test)] +mod tests { + use super::*; + use combine::EasyParser; + + #[test] + fn unquoted_parses_chars() { + for c in &['a', 'b', '$', '#'] { + let s = c.to_string(); + let (_, rest) = unquoted().easy_parse(s.as_str()).unwrap(); + assert_eq!(rest, ""); + } + } + + #[test] + fn unquoted_middle_segment_parses_dots() { + let (_, rest) = unquoted_middle_segment().easy_parse(".one").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn unquoted_full_parses_dots() { + let (_, rest) = unquoted_full().easy_parse("one.two.three.four").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn parses_basic_email() { + let (_, rest) = email().easy_parse("a@b").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn doesnt_parse_invalid_email() { + assert!(email().easy_parse("@a@b").is_err()); + } 
+ + #[test] + fn parses_longer_email() { + let (_, rest) = email() + .easy_parse("one.two.three.four@sub.domain.tld") + .unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn doesnt_parse_double_dot() { + assert!(email().easy_parse("bad..email@tld").is_err()); + } + + #[test] + fn doesnt_parse_dot_local() { + assert!(email().easy_parse(".local@tld").is_err()); + } + + #[test] + fn doesnt_parse_end_dot_local() { + assert!(email().easy_parse("local.@tld").is_err()); + } +} diff --git a/content/src/handle.rs b/content/src/handle.rs new file mode 100644 index 0000000..d602351 --- /dev/null +++ b/content/src/handle.rs @@ -0,0 +1,68 @@ +use combine::{many1, parser::char::alpha_num, Parser, Stream}; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Handle { + pub handle: String, + pub domain: String, +} + +fn at() -> impl Parser +where + Input: Stream, +{ + combine::parser::char::char('@') +} + +fn handle_part() -> impl Parser +where + Input: Stream, +{ + many1(alpha_num()) +} + +fn domain_part() -> impl Parser +where + Input: Stream, +{ + crate::url::domain().map(|d| d.0) +} + +pub(crate) fn handle() -> impl Parser +where + Input: Stream, +{ + at().with(handle_part()) + .skip(at()) + .and(domain_part()) + .map(|(handle, domain)| Handle { handle, domain }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_handle_part() { + let (_, rest) = handle_part().parse("as123").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn parse_simple_handle() { + let (_, rest) = handle().parse("@asdf@asdf").unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn parse_complex_handle() { + let (_, rest) = handle() + .parse("@r2d2@telnet.towel.blinkenlights.nl") + .unwrap(); + assert_eq!(rest, ""); + } + + #[test] + fn dont_parse_invalid_handle() { + assert!(handle().parse("asdf@asdf").is_err()) + } +} diff --git a/content/src/lib.rs b/content/src/lib.rs index fa4ded2..cf178cf 100644 --- a/content/src/lib.rs +++ b/content/src/lib.rs @@ -3,6 +3,16 @@ use 
once_cell::sync::Lazy; use std::borrow::Cow; use std::collections::{HashMap, HashSet}; +mod bbcode; +mod color; +mod email; +mod handle; +mod render; +mod url; + +pub use bbcode::Tag; +pub use render::NodeView; + fn allow_styles<'u>(allowed: &[&str], value: &'u str) -> Option> { let mut altered = false; let rules: Vec<_> = value @@ -38,9 +48,7 @@ fn attribute_filter<'u>(element: &str, attribute: &str, value: &'u str) -> Optio ("span", "style") => allow_styles(&["color", "opacity"], value), ("div", "class") | ("span", "class") - | ("figure", "class") | ("pre", "class") - | ("pre", "data-language") | ("span", "data-symbol") | ("blockquote", "data-author") | ("a", "rel") @@ -53,44 +61,56 @@ fn attribute_filter<'u>(element: &str, attribute: &str, value: &'u str) -> Optio } } -// Classes based on bbclash BBCode spec: -// https://github.com/EndaHallahan/BBClash/blob/master/Spec.md +static STRIP_CONFIG: Lazy = Lazy::new(|| { + let mut builder = Builder::new(); + builder.allowed_classes(HashMap::new()).tags(HashSet::new()); + builder +}); + static AMMONIA_CONFIG: Lazy = Lazy::new(|| { let mut classes = HashMap::new(); let div_hs = classes.entry("div").or_insert(HashSet::new()); div_hs.insert("center"); div_hs.insert("right"); - div_hs.insert("math_container"); - // div_hs.insert("embed"); for now, no embeds - div_hs.insert("indent-1"); - div_hs.insert("indent-2"); - div_hs.insert("indent-3"); - div_hs.insert("indent-4"); let span_hs = classes.entry("span").or_insert(HashSet::new()); span_hs.insert("underline"); span_hs.insert("smallcaps"); span_hs.insert("monospace"); span_hs.insert("spoiler"); - span_hs.insert("math_container"); let pre_hs = classes.entry("pre").or_insert(HashSet::new()); pre_hs.insert("codeblock"); - let figure_hs = classes.entry("figure").or_insert(HashSet::new()); - figure_hs.insert("figure-right"); - figure_hs.insert("figure-left"); - let mut schemes = HashSet::new(); schemes.insert("http"); schemes.insert("https"); schemes.insert("mailto"); + let mut 
tags = HashSet::new(); + tags.insert("div"); + tags.insert("span"); + tags.insert("pre"); + tags.insert("code"); + tags.insert("i"); + tags.insert("em"); + tags.insert("b"); + tags.insert("strong"); + tags.insert("s"); + tags.insert("sub"); + tags.insert("sup"); + tags.insert("blockquote"); + tags.insert("a"); + tags.insert("img"); + tags.insert("br"); + let mut builder = Builder::new(); builder + .tags(tags) .allowed_classes(classes) .url_schemes(schemes) + .link_rel(Some("nofollow noopener noreferer")) .attribute_filter(attribute_filter) .add_tag_attributes("span", &["style"]) .add_tag_attributes("div", &["style"]); @@ -104,8 +124,12 @@ pub fn html(source: &str) -> String { h } -pub fn bbcode(source: &str) -> String { - let bb = bbclash::bbcode_to_html(source); - log::debug!("{}", bb); - bb +pub fn bbcode(source: &str, mapper: F) -> String +where + for<'a> F: Fn(NodeView<'a>) -> NodeView<'a> + Copy, +{ + let stripped = STRIP_CONFIG.clean(source).to_string(); + let preprocessed = render::preprocessor(&stripped, mapper); + log::debug!("{}", preprocessed); + preprocessed } diff --git a/content/src/render.rs b/content/src/render.rs new file mode 100644 index 0000000..862e3e3 --- /dev/null +++ b/content/src/render.rs @@ -0,0 +1,483 @@ +use crate::bbcode::Tag; +use std::borrow::Cow; + +#[derive(Debug)] +pub enum NodeView<'a> { + Tag { + tag: Tag, + attr: Option>, + }, + Url { + href: Cow<'a, str>, + }, + IconText { + handle: Cow<'a, str>, + domain: Cow<'a, str>, + img: Option, + href: Option, + }, + Icon { + handle: Cow<'a, str>, + domain: Cow<'a, str>, + img: Option, + href: Option, + }, + Handle { + handle: Cow<'a, str>, + domain: Cow<'a, str>, + href: Option, + }, + Email { + email: Cow<'a, str>, + }, + Text { + text: Cow<'a, str>, + }, + Newline, +} + +#[derive(Debug)] +enum Node { + Tag { + tag: Tag, + attr: Option, + children: Vec, + }, + Url { + href: String, + }, + Handle { + handle: String, + domain: String, + }, + Email { + email: String, + }, + Text { + 
text: String, + }, + Newline, +} + +#[derive(Debug)] +enum RenderNode { + Tag { + tag: Tag, + attr: Option, + children: Vec, + }, + Url { + href: String, + }, + IconText { + handle: String, + domain: String, + img: String, + href: String, + }, + Icon { + handle: String, + domain: String, + img: String, + href: String, + }, + Handle { + handle: String, + domain: String, + href: String, + }, + Email { + email: String, + }, + Text { + text: String, + }, + Newline, +} + +fn render_nodes(nodes: Vec) -> String { + nodes + .into_iter() + .map(|node| { + log::trace!("Rendering {:?}", node); + match node { + RenderNode::Tag { + tag, + attr, + children, + } => match tag { + Tag::Right if !children.is_empty() => { + String::new() + "
" + &render_nodes(children) + "
" + } + Tag::Center if !children.is_empty() => { + String::new() + + "
" + + &render_nodes(children) + + "
" + } + Tag::Quote if !children.is_empty() => { + String::new() + "
" + &render_nodes(children) + "
" + } + Tag::Color if !children.is_empty() => { + if let Some(attr) = attr { + format!("", attr) + + &render_nodes(children) + + "" + } else { + render_nodes(children) + } + } + Tag::Code if !children.is_empty() => { + String::new() + "" + &render_nodes(children) + "" + } + Tag::Codeblock if !children.is_empty() => { + String::new() + + "
"
+                            + &render_nodes(children)
+                            + "
" + } + Tag::Pre if !children.is_empty() => { + String::new() + "
" + &render_nodes(children) + "
" + } + Tag::Mono if !children.is_empty() => { + String::new() + + "" + + &render_nodes(children) + + "" + } + Tag::Sub if !children.is_empty() => { + String::new() + "" + &render_nodes(children) + "" + } + Tag::Sup if !children.is_empty() => { + String::new() + "" + &render_nodes(children) + "" + } + Tag::S if !children.is_empty() => { + String::new() + "" + &render_nodes(children) + "" + } + Tag::Spoiler if !children.is_empty() => { + String::new() + + "" + + &render_nodes(children) + + "" + } + Tag::Bold if !children.is_empty() => { + String::new() + "" + &render_nodes(children) + "" + } + Tag::Strong if !children.is_empty() => { + String::new() + "" + &render_nodes(children) + "" + } + Tag::I if !children.is_empty() => { + String::new() + "" + &render_nodes(children) + "" + } + Tag::Em if !children.is_empty() => { + String::new() + "" + &render_nodes(children) + "" + } + Tag::U if !children.is_empty() => { + String::new() + + "" + + &render_nodes(children) + + "" + } + Tag::Smcaps if !children.is_empty() => { + String::new() + + "" + + &render_nodes(children) + + "" + } + Tag::IconText if !children.is_empty() => render_nodes(children), + Tag::Icon if !children.is_empty() => render_nodes(children), + Tag::Hr => String::from("
"), + Tag::Url if !children.is_empty() => { + if let Some(href) = attr { + format!("", href) + + &render_nodes(children) + + "" + } else { + render_nodes(children) + } + } + _ => String::new(), + }, + RenderNode::Url { href } => format!( + "{href}", + href = href + ), + RenderNode::IconText { + handle, + domain, + img, + href, + } => { + format!("", href) + + &format!( + "\"@{handle}@{domain}\"", + img, + handle = handle, + domain = domain + ) + &format!("@{}@{}", handle, domain) + + &format!("") + } + RenderNode::Icon { + handle, + domain, + img, + href, + } => { + format!("", href) + + &format!( + "\"@{handle}@{domain}\"", + img, + handle = handle, + domain = domain + ) + &format!("") + } + RenderNode::Handle { + handle, + domain, + href, + } => { + format!("", href) + + &format!("@{}@{}", handle, domain) + + &format!("") + } + RenderNode::Email { email } => { + format!("{email}", email = email) + } + RenderNode::Text { text } => text, + RenderNode::Newline => format!("
"), + } + }) + .collect::() +} + +fn to_render<'b, F>(node: NodeView<'b>, children: Option>, f: F) -> RenderNode +where + for<'a> F: Fn(NodeView<'a>) -> NodeView<'a> + Copy, +{ + match node { + NodeView::Tag { tag, attr } => RenderNode::Tag { + tag, + attr: attr.map(|a| a.to_string()), + children: map_nodes(children.unwrap_or(vec![]), f), + }, + NodeView::Url { href } => RenderNode::Url { + href: href.to_string(), + }, + NodeView::IconText { + handle, + domain, + img, + href, + } => match (img, href) { + (Some(img), Some(href)) => RenderNode::IconText { + handle: handle.to_string(), + domain: domain.to_string(), + img, + href, + }, + (None, Some(href)) => RenderNode::Handle { + handle: handle.to_string(), + domain: domain.to_string(), + href, + }, + _ => RenderNode::Text { + text: format!("@{}@{}", handle, domain), + }, + }, + NodeView::Icon { + handle, + domain, + img, + href, + } => match (img, href) { + (Some(img), Some(href)) => RenderNode::Icon { + handle: handle.to_string(), + domain: domain.to_string(), + img, + href, + }, + (None, Some(href)) => RenderNode::Handle { + handle: handle.to_string(), + domain: domain.to_string(), + href, + }, + _ => RenderNode::Text { + text: format!("@{}@{}", handle, domain), + }, + }, + NodeView::Handle { + handle, + domain, + href, + } => match href { + Some(href) => RenderNode::Handle { + handle: handle.to_string(), + domain: domain.to_string(), + href: href.to_string(), + }, + None => RenderNode::Text { + text: format!("@{}@{}", handle, domain), + }, + }, + NodeView::Email { email } => RenderNode::Email { + email: email.to_string(), + }, + NodeView::Text { text } => RenderNode::Text { + text: text.to_string(), + }, + NodeView::Newline => RenderNode::Newline, + } +} + +fn map_nodes(nodes: Vec, f: F) -> Vec +where + for<'a> F: Fn(NodeView<'a>) -> NodeView<'a> + Copy, +{ + nodes + .into_iter() + .map(move |node| { + log::trace!("Mapping {:?}", node); + match node { + Node::Tag { + tag, + attr, + children, + } => to_render( + 
(f)(NodeView::Tag { + tag, + attr: attr.as_deref().map(Cow::Borrowed), + }), + Some(children), + f, + ), + Node::Url { href } => to_render( + (f)(NodeView::Url { + href: Cow::Borrowed(&href), + }), + None, + f, + ), + Node::Handle { handle, domain } => to_render( + (f)(NodeView::Handle { + handle: Cow::Borrowed(&handle), + domain: Cow::Borrowed(&domain), + href: None, + }), + None, + f, + ), + Node::Email { email } => to_render( + (f)(NodeView::Email { + email: Cow::Borrowed(&email), + }), + None, + f, + ), + Node::Text { text } => to_render( + (f)(NodeView::Text { + text: Cow::Borrowed(&text), + }), + None, + f, + ), + Node::Newline => to_render((f)(NodeView::Newline), None, f), + } + }) + .collect() +} + +fn build_nodes(input: Vec) -> Vec { + let mut nodes = vec![]; + + for n in input { + log::trace!("Building {:?}", n); + match n { + crate::bbcode::Node::TagNode { + tag, + attr, + children, + } => nodes.push(Node::Tag { + tag, + attr, + children: build_nodes(children), + }), + crate::bbcode::Node::UrlNode { url } => nodes.push(Node::Url { + href: url.to_string(), + }), + crate::bbcode::Node::HandleNode { handle, domain } => { + nodes.push(Node::Handle { handle, domain }) + } + crate::bbcode::Node::EmailNode { email } => nodes.push(Node::Email { email }), + crate::bbcode::Node::CharNode { text: c_text } => match nodes.last_mut() { + Some(Node::Text { ref mut text }) => { + text.push(c_text); + } + _ => { + let mut text = String::new(); + text.push(c_text); + nodes.push(Node::Text { text }); + } + }, + crate::bbcode::Node::NewlineNode => nodes.push(Node::Newline), + }; + } + + nodes +} + +pub(crate) fn preprocessor(source: &str, mapper: F) -> String +where + for<'a> F: Fn(NodeView<'a>) -> NodeView<'a> + Copy, +{ + use combine::Parser; + + let parsenodes = crate::bbcode::node_vec(None) + .parse(source) + .ok() + .map(|(nodes, rest)| { + if rest.len() > 0 { + log::warn!("Failed to parse '{}', rest: '{}'", source, rest); + } + + nodes + }) + .unwrap_or(vec![]); + + 
render_nodes(map_nodes(build_nodes(parsenodes), mapper))
+}
+// Unit tests that run `preprocessor` with an identity mapper and compare against literal output.
+#[cfg(test)]
+mod tests {
+    use super::preprocessor;
+    // NOTE(review): no expected string below contains markup, even for [url]/[strong] — confirm.
+    #[test]
+    fn basic_parse() {
+        let input = "some plain text";
+        let output = preprocessor(input, |view| view);
+        assert_eq!(output, input) // plain text is expected back unchanged
+    }
+
+    #[test]
+    fn parse_with_link() {
+        let input = "it's http://example.com a link"; // bare URL inside running text
+        let output = preprocessor(input, |view| view);
+        assert_eq!(output, "it's http://example.com a link");
+    }
+
+    #[test]
+    fn parse_with_custom_link() {
+        let input = "it's [url=http://example.com]a link[/url]"; // explicit [url=...] tag
+        let output = preprocessor(input, |view| view);
+        assert_eq!(
+            output,
+            "it's a link"
+        );
+    }
+
+    #[test]
+    fn parse_with_strong() {
+        let input = "it's [strong]bold[/strong] right"; // [strong] tag around one word
+        let output = preprocessor(input, |view| view);
+        assert_eq!(output, "it's bold right");
+    }
+}
diff --git a/content/src/url.rs b/content/src/url.rs
new file mode 100644
index 0000000..a581ee6
--- /dev/null
+++ b/content/src/url.rs
@@ -0,0 +1,401 @@
+use combine::{
+    choice, many, many1, optional,
+    parser::char::{alpha_num, digit, hex_digit, string},
+    Parser, Stream,
+};
+/// URL scheme recognised by the parser: `http` or `https`.
+#[derive(Clone, Copy, Debug)]
+enum Scheme {
+    Http,
+    Https,
+}
+/// Host name text exactly as matched by [`domain`].
+#[derive(Clone, Debug)]
+pub(crate) struct Domain(pub(crate) String);
+/// Digits that followed the `:` after the domain, kept as text.
+#[derive(Clone, Debug)]
+struct Port(String);
+/// Path text, including every `/` that was matched.
+#[derive(Clone, Debug)]
+struct Path(String);
+/// Query text without its leading `?`.
+#[derive(Clone, Debug)]
+struct Query(String);
+/// Fragment text without its leading `#`.
+#[derive(Clone, Debug)]
+struct Fragment(String);
+/// A parsed URL split into components; its `Display` impl joins them back into one string.
+#[derive(Clone, Debug)]
+pub struct Url {
+    scheme: Scheme,
+    domain: Domain,
+    port: Option, // NOTE(review): type parameters look stripped throughout this text — confirm
+    path: Option,
+    query: Option,
+    fragment: Option,
+}
+/// Parses `http://` or `https://` and yields the matching [`Scheme`].
+fn scheme() -> impl Parser
+where
+    Input: Stream,
+{
+    let http = string("http");
+    let s = combine::parser::char::char('s');
+
+    let https = http.and(optional(s)).map(|(_, c)| {
+        if c.is_some() {
+            Scheme::Https
+        } else {
+            Scheme::Http
+        }
+    });
+
+    let separator = string("://");
+
+    https.skip(separator) // the separator must be present but is not part of the output
+}
+/// Parses one or more alphanumeric or `-` characters (one dot-free piece of a domain).
+fn domain_text() -> impl Parser
+where
+    Input: Stream,
+{
+    let domain_char = alpha_num().or(combine::parser::char::char('-'));
+    many1(domain_char)
+}
+/// Parses a `domain_text` run followed by zero or more `.`-prefixed runs into a [`Domain`].
+pub(crate) fn domain() -> impl Parser
+where
+    Input: Stream,
+{
+    let domain_segment =
+        combine::parser::char::char('.')
+            .and(domain_text())
+            .map(|(c, s): (char, String)| {
+                let mut string = String::new();
+                string.push(c);
+                string += &s;
+                string
+            });
+    // NOTE(review): a '.' with nothing valid after it is consumed before the segment fails;
+    // presumably `many` then fails too (e.g. "example.com.") — confirm, may need `attempt`.
+    domain_text()
+        .and(many(domain_segment))
+        .map(|(first, rest): (String, String)| Domain(first + &rest))
+}
+/// Parses `:` followed by one or more digits; only the digits are kept.
+fn port() -> impl Parser
+where
+    Input: Stream,
+{
+    let colon = combine::parser::char::char(':');
+    let port = many1(digit()).map(Port);
+    // NOTE(review): a ':' not followed by a digit is consumed before failing — confirm callers cope.
+    colon.with(port)
+}
+/// Parses one of `!$&'()*+,;=` (presumably the RFC 3986 "sub-delims" set — confirm).
+fn subdelim() -> impl Parser
+where
+    Input: Stream,
+{
+    let exclamation = combine::parser::char::char('!');
+    let dollar = combine::parser::char::char('$');
+    let and = combine::parser::char::char('&');
+    let apostrophe = combine::parser::char::char('\'');
+    let open_paren = combine::parser::char::char('(');
+    let close_paren = combine::parser::char::char(')');
+    let asterisk = combine::parser::char::char('*');
+    let plus = combine::parser::char::char('+');
+    let comma = combine::parser::char::char(',');
+    let semi_colon = combine::parser::char::char(';');
+    let equal = combine::parser::char::char('=');
+
+    choice((
+        exclamation,
+        dollar,
+        and,
+        apostrophe,
+        open_paren,
+        close_paren,
+        asterisk,
+        plus,
+        comma,
+        semi_colon,
+        equal,
+    ))
+}
+/// Parses one of `-`, `.`, `_`, `~` or an alphanumeric character.
+fn unreserved() -> impl Parser
+where
+    Input: Stream,
+{
+    let dash = combine::parser::char::char('-');
+    let dot = combine::parser::char::char('.');
+    let underscore = combine::parser::char::char('_');
+    let tilde = combine::parser::char::char('~');
+
+    choice((dash, dot, underscore, tilde, alpha_num()))
+}
+/// Parses `%` plus two hex digits and returns those three characters as a `String`.
+fn pct_encoded() -> impl Parser
+where
+    Input: Stream,
+{
+    combine::parser::char::char('%')
+        .and(hex_digit())
+        .and(hex_digit())
+        .map(|((s1, s2), s3)| {
+            let mut s = String::new();
+            s.push(s1);
+            s.push(s2);
+            s.push(s3);
+            s
+        })
+}
+/// Parses one path character as a `String`: unreserved, percent-encoded, subdelim, `:` or `@`.
+fn pchar() -> impl Parser
+where
+    Input: Stream,
+{
+    choice((
+        unreserved().map(String::from), // single chars are widened to String to match pct_encoded
+        pct_encoded(),
+        subdelim().map(String::from),
+        combine::parser::char::char(':').map(String::from),
+        combine::parser::char::char('@').map(String::from),
+    ))
+}
+/// Parses one or more `/segment` parts into a [`Path`]; a segment may be empty.
+fn path() -> impl Parser
+where
+    Input: Stream,
+{
+    let slash = combine::parser::char::char('/');
+
+    let segment = many(pchar());
+
+    let path_part = slash
+        .and(segment)
+        .map(|(slash, segment): (char, String)| String::new() + &slash.to_string() + &segment);
+
+    many1(path_part).map(Path)
+}
+/// Parses a literal `?`.
+fn question() -> impl Parser
+where
+    Input: Stream,
+{
+    combine::parser::char::char('?')
+}
+/// Parses `?` followed by any run of `pchar`, `/` or `?`; the leading `?` is not stored.
+fn query() -> impl Parser
+where
+    Input: Stream,
+{
+    let query_char = choice((
+        pchar(),
+        combine::parser::char::char('/').map(String::from),
+        question().map(String::from),
+    ));
+
+    question().with(many(query_char)).map(Query)
+}
+/// Parses `#` followed by any run of `pchar`, `/` or `?`; the leading `#` is not stored.
+fn fragment() -> impl Parser
+where
+    Input: Stream,
+{
+    let hash = combine::parser::char::char('#');
+
+    let fragment_char = choice((
+        pchar(),
+        combine::parser::char::char('/').map(String::from),
+        combine::parser::char::char('?').map(String::from),
+    ));
+
+    hash.with(many(fragment_char)).map(Fragment)
+}
+/// Parses scheme and domain, then an optional port, path, query and fragment, in that order.
+pub(crate) fn url() -> impl Parser
+where
+    Input: Stream,
+{
+    scheme()
+        .and(domain())
+        .and(optional(port()))
+        .and(optional(path()))
+        .and(optional(query()))
+        .and(optional(fragment()))
+        .map(
+            |(((((scheme, domain), port), path), query), fragment)| Url {
+                scheme,
+                domain,
+                port,
+                path,
+                query,
+                fragment,
+            },
+        )
+}
+// Writes the bare scheme name; the `://` separator is added by `Url`'s Display impl.
+impl std::fmt::Display for Scheme {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Scheme::Http => write!(f, "http"),
+            Scheme::Https => write!(f, "https"),
+        }
+    }
+}
+// Writes the URL back out, re-adding the `://`, `:`, `?` and `#` separators the parsers dropped.
+impl std::fmt::Display for Url {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}://", self.scheme)?;
+        write!(f, "{}", self.domain.0)?;
+
+        if let Some(port) = &self.port {
+            write!(f, ":{}", port.0)?;
+        }
+        if let Some(path) = &self.path {
+            write!(f, "{}", path.0)?; // stored path text already carries its own '/' characters
+        }
+        if let Some(query) = &self.query {
+            write!(f, "?{}", query.0)?;
+        }
+        if let Some(fragment) = &self.fragment {
+            write!(f, "#{}", fragment.0)?;
+        }
+
+        Ok(())
+    }
+}
+// Parser unit tests; most assert that `easy_parse` leaves no unparsed remainder (`rest` is empty).
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use combine::EasyParser;
+
+    #[test]
+    fn parse_https() {
+        let (_, rest) = scheme().easy_parse("https://").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_http() {
+        let (_, rest) = scheme().easy_parse("http://").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_domain_text() {
+        let (_, rest) = domain_text().easy_parse("hyaenidae-3").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_simple_domain() {
+        let (_, rest) = domain().easy_parse("example.com").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_single_domain() {
+        let (_, rest) = domain().easy_parse("hyaenidae-3").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_subdomains() {
+        let (_, rest) = domain().easy_parse("one.two.three.four").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_single_slash() {
+        let (_, rest) = path().easy_parse("/").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_double_slash() {
+        let (_, rest) = path().easy_parse("//").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_deep_path() {
+        let (_, rest) = path().easy_parse("/one/two/three/four").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn dont_parse_invalid_path() {
+        assert!(path().easy_parse("asdf").is_err());
+    }
+
+    #[test]
+    fn parse_empty_query() {
+        query().easy_parse("?").unwrap(); // only checks that parsing succeeds; remainder unchecked
+    }
+
+    #[test]
+    fn parse_long_query() {
+        let (_, rest) = query().easy_parse("?one=two&three=four").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn dont_parse_invalid_query() {
+        assert!(query().easy_parse("asdf").is_err());
+    }
+
+    #[test]
+    fn parse_empty_fragment() {
+        let (_, rest) = fragment().easy_parse("#").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_long_fragment() {
+        let (_, rest) = fragment().easy_parse("#asdf-5").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn dont_parse_invalid_fragment() {
+        assert!(fragment().easy_parse("asdf").is_err());
+    }
+
+    #[test]
+    fn parse_example_com() {
+        let (_, rest) = url().easy_parse("http://example.com").unwrap();
+        assert_eq!(rest, "");
+    }
+
+    #[test]
+    fn parse_extended_example_com() {
+        let (_, rest) = url()
+            .easy_parse("https://www.example.com/path/part?query=hi#fragment")
+            .unwrap();
+        assert_eq!(rest, "");
+    }
+    // The round-trip tests also check that `to_string()` reproduces the input text exactly.
+    #[test]
+    fn round_trip_example_com() {
+        let (url, rest) = url().easy_parse("http://example.com").unwrap();
+        let url_str = url.to_string();
+        assert_eq!(rest, "");
+        assert_eq!(url_str, "http://example.com")
+    }
+
+    #[test]
+    fn round_trip_extended_example_com() {
+        let (url, rest) = url()
+            .easy_parse("https://www.example.com/path/part?query=hi#fragment")
+            .unwrap();
+        let url_str = url.to_string();
+        assert_eq!(rest, "");
+        assert_eq!(
+            url_str,
+            "https://www.example.com/path/part?query=hi#fragment"
+        );
+    }
+}