use pulldown parser for md

This commit is contained in:
Maciej Jur 2024-04-16 23:53:46 +02:00
parent 289a1a4a4d
commit b41c815aed
Signed by: kamov
GPG key ID: 191CBFF5F72ECAFD
13 changed files with 203 additions and 249 deletions

103
Cargo.lock generated
View file

@ -57,6 +57,12 @@ dependencies = [
"unscanny",
]
[[package]]
name = "bitflags"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
[[package]]
name = "bumpalo"
version = "3.15.4"
@ -132,23 +138,6 @@ version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e769b5c8c8283982a987c6e948e540254f1058d5a74b8794914d4ef5fc2a24"
[[package]]
name = "comrak"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0436149c9f6a1935b13306206c739b1ba84fa81f551b5eb87fc2ca7a13700af"
dependencies = [
"derive_builder",
"emojis",
"entities",
"memchr",
"once_cell",
"regex",
"slug",
"typed-arena",
"unicode_categories",
]
[[package]]
name = "copy_dir"
version = "0.1.3"
@ -236,12 +225,6 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "deunicode"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6e854126756c496b8c81dec88f9a706b15b875c5849d4097a3854476b9fdf94"
[[package]]
name = "displaydoc"
version = "0.2.4"
@ -259,21 +242,6 @@ version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
[[package]]
name = "emojis"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3407bc749191827d456a282321770847daf4b0a1128fde02597a8ed2e987b95d"
dependencies = [
"phf",
]
[[package]]
name = "entities"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca"
[[package]]
name = "equivalent"
version = "1.0.1"
@ -295,6 +263,15 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.2.12"
@ -688,6 +665,23 @@ dependencies = [
"version_check",
]
[[package]]
name = "pulldown-cmark"
version = "0.10.0"
source = "git+https://github.com/pulldown-cmark/pulldown-cmark.git?branch=branch_0.11#2540d4095d5f94fd972c93b927e9bec83b0c5079"
dependencies = [
"bitflags",
"getopts",
"memchr",
"pulldown-cmark-escape",
"unicase",
]
[[package]]
name = "pulldown-cmark-escape"
version = "0.10.0"
source = "git+https://github.com/pulldown-cmark/pulldown-cmark.git?branch=branch_0.11#2540d4095d5f94fd972c93b927e9bec83b0c5079"
[[package]]
name = "quick-js"
version = "0.4.1"
@ -861,23 +855,12 @@ version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "slug"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bd94acec9c8da640005f8e135a39fc0372e74535e6b368b7a04b875f784c8c4"
dependencies = [
"deunicode",
"wasm-bindgen",
]
[[package]]
name = "ssg"
version = "0.1.0"
dependencies = [
"aho-corasick",
"chrono",
"comrak",
"glob",
"grass",
"gray_matter",
@ -885,6 +868,7 @@ dependencies = [
"hypertext",
"katex",
"once_cell",
"pulldown-cmark",
"regex",
"serde",
"tree-sitter",
@ -1158,12 +1142,6 @@ dependencies = [
"tree-sitter",
]
[[package]]
name = "typed-arena"
version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a"
[[package]]
name = "unic-langid"
version = "0.9.4"
@ -1183,6 +1161,15 @@ dependencies = [
"tinystr",
]
[[package]]
name = "unicase"
version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89"
dependencies = [
"version_check",
]
[[package]]
name = "unicode-bidi"
version = "0.3.15"
@ -1211,10 +1198,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
[[package]]
name = "unicode_categories"
version = "0.1.1"
name = "unicode-width"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
[[package]]
name = "unsafe-libyaml"

View file

@ -6,7 +6,6 @@ edition = "2021"
[dependencies]
aho-corasick = "1.1.3"
chrono = "0.4.35"
comrak = { version = "0.22.0", default-features = false, features = ["shortcodes"] }
glob = "0.3.1"
grass = { version = "0.13.2", default-features = false, features = ["random"] }
gray_matter = { version = "0.2.6", default-features = false, features = ["yaml"] }
@ -17,10 +16,14 @@ once_cell = "1.19.0"
regex = "1.10.4"
serde = { version = "1.0.197", features = ["derive"] }
# Markdown
pulldown-cmark = { git = "https://github.com/pulldown-cmark/pulldown-cmark.git", branch = "branch_0.11" }
# Treesitter
tree-sitter = "0.20.10"
tree-sitter-highlight = "0.20.1"
# Treesitter languages
tree-sitter-astro = { git = "https://github.com/virchau13/tree-sitter-astro.git", rev = "e924787e12e8a03194f36a113290ac11d6dc10f3" }
tree-sitter-css = "0.20.0"
tree-sitter-haskell = { git = "https://github.com/tree-sitter/tree-sitter-haskell", rev = "1da347c88599faad7964e63facead5d163ac7dba" }

View file

@ -1,7 +1,3 @@
---
title: "Test"
date: 2021-09-10T19:34:01+02:00
---
# Welcome to my website! :heart:
You have found this little floating rock in the middle of the Internet! Congrats 🎉

View file

@ -6,16 +6,15 @@ tags: [Japanese, zola, hugo, astro]
Sadly, as far as I know CommonMark currently doesn't include anything about ruby in its spec. On top of that ruby is pretty uncommon, so it is pretty rare for any ruby extensions to exist. As I move through any new frameworks, I will try to document any simple solutions that I figure out.
## Examples
| Language | Example |
| -------- | ------- |
| Japanese | :ruby[日本語]{help=;ほん;ご}の:ruby[文法]{help=ぶん;ぽう}は:ruby[難]{help=むずか}しい |
| Chinese | :ruby[北京]{help=Běi;jīng}<br/>:ruby[北京]{help=ㄅㄟˇ;ㄐㄧㄥ} |
| Korean | :ruby[韓國]{help=;국} |
| Vietnamese | :ruby[河內]{help=;Nội} |
| Other | I :ruby[love]{help=like} ruby! |
| Japanese | [日本語]{にほんご}の[文法]{ぶんぽう}は[難]{むずか}しい |
| Chinese | [北京]{Běijīng}<br/>[北京]{ㄅㄟˇㄐㄧㄥ} |
| Korean | [韓國]{한국} |
| Vietnamese | [河內]{HàNội} |
| Other | I [love]{like} ruby! |
## Remark

View file

@ -1,6 +1,6 @@
use hypertext::{html_elements, maud, maud_move, GlobalAttributes, Raw, Renderable};
use crate::md::render;
use crate::text::md::parse;
use super::page;
@ -21,7 +21,7 @@ const INTRO: &str = r#"
fn intro() -> impl Renderable {
maud!(
section .p-card.intro-jp lang="ja-JP" {
(Raw(render(INTRO)))
(Raw(parse(INTRO)))
}
)
}

View file

@ -1,7 +1,7 @@
use std::process::Command;
use std::{collections::HashMap, path::Path};
use std::fs;
use chrono::{Datelike, Utc};
use chrono::Datelike;
use grass;
use html::LinkableData;
use hypertext::{Raw, Renderable};
@ -12,6 +12,7 @@ mod html;
mod ts;
mod gen;
mod utils;
mod text;
#[derive(Debug)]
@ -90,7 +91,7 @@ impl Transformable for md::Post {
}
fn render(data: &str) -> String {
md::render(data)
text::md::parse(data)
}
}
@ -115,7 +116,7 @@ impl Transformable for md::Slide {
fn render(data: &str) -> String {
data
.split("\n-----\n")
.map(|chunk| chunk.split("\n---\n").map(md::render).collect::<Vec<_>>())
.map(|chunk| chunk.split("\n---\n").map(text::md::parse).collect::<Vec<_>>())
.map(|stack| match stack.len() > 1 {
true => format!("<section>{}</section>", stack.into_iter().map(|slide| format!("<section>{slide}</section>")).collect::<String>()),
false => format!("<section>{}</section>", stack[0])
@ -138,7 +139,7 @@ impl Transformable for md::Wiki {
}
fn render(data: &str) -> String {
md::render(data)
text::md::parse(data)
}
}
@ -246,7 +247,7 @@ fn main() {
gen::Asset {
kind: gen::AssetKind::Html(Box::new(|_| {
let data = std::fs::read_to_string("content/index.md").unwrap();
let data = md::render(&data);
let data = text::md::parse(&data);
html::home(Raw(data)).render().to_owned().into()
})),
out: "index.html".into(),

View file

@ -1,29 +0,0 @@
use std::cell::RefCell;
use comrak::{Arena, nodes::{Ast, AstNode, LineColumn, NodeValue}};
use hayagriva::{BibliographyDriver, Library};
use once_cell::sync::Lazy;
use regex::Regex;
use super::render::iter_nodes;
/// Matches citation shortcodes written as `:cite[key]`.
///
/// Capture group 1 is the key, which must consist of one or more `\w`
/// characters.
static RE_CITE: Lazy<Regex> = Lazy::new(||
Regex::new(r":cite\[(\w+)\]").unwrap()
);
/// Walks the tree below `root` and prints every `:cite[key]` key found in
/// text nodes.
///
/// The tree itself is left untouched and `arena` is not used; the keys are
/// only written to stdout in `Debug` form, one per line.
pub fn add_cite<'a>(root: &'a AstNode<'a>, arena: &'a Arena<AstNode<'a>>) {
    iter_nodes(root, &|node| {
        // Only plain text nodes can carry a citation shortcode.
        if let NodeValue::Text(content) = &node.data.borrow_mut().value {
            RE_CITE
                .captures_iter(content)
                .map(|caps| caps.get(1).unwrap().as_str())
                .for_each(|key| println!("{:?}", key));
        }
    });
}

View file

@ -1,8 +1,4 @@
mod matter;
mod render;
mod ruby;
mod cite;
pub use matter::{Post, Slide, Wiki};
pub use matter::preflight;
pub use render::render;

View file

@ -1,77 +0,0 @@
use std::cell::RefCell;
use comrak::{Arena, parse_document, format_html, Options};
use comrak::nodes::{Ast, AstNode, LineColumn, NodeValue};
use once_cell::unsync::Lazy;
use crate::ts;
/// comrak configuration shared by parsing and HTML formatting in `render`.
///
/// - extensions: front matter delimited by `---`, tables, dollar-delimited
///   math and shortcodes;
/// - parsing: smart punctuation;
/// - rendering: `unsafe_` is switched on.
///
/// Every builder is expected to succeed, hence the `unwrap()` calls.
// NOTE(review): this is a `const` wrapping a `Lazy`, so each mention of
// `OPTIONS` presumably constructs a fresh value instead of sharing a single
// cached one — confirm whether a `static` was intended.
const OPTIONS: Lazy<Options> = Lazy::new(||
Options {
extension: comrak::ExtensionOptionsBuilder::default()
.front_matter_delimiter(Some("---".into()))
.table(true)
.math_dollars(true)
.shortcodes(true)
.build()
.unwrap(),
parse: comrak::ParseOptionsBuilder::default()
.smart(true)
.build()
.unwrap(),
render: comrak::RenderOptionsBuilder::default()
.unsafe_(true)
.build()
.unwrap(),
}
);
/// Applies `f` to `node` and then, recursively, to each of its children.
///
/// A node is always visited before its children, and siblings are visited in
/// the order produced by `children()`.
pub fn iter_nodes<'a, F>(node: &'a AstNode<'a>, f: &F)
where
    F: Fn(&'a AstNode<'a>),
{
    f(node);
    node.children().for_each(|child| iter_nodes(child, f));
}
/// Renders the Markdown document `raw` to an HTML string.
///
/// After parsing with `OPTIONS`, code blocks are replaced by the output of
/// `ts::highlight` and math nodes by KaTeX MathML; both replacements are
/// inserted as raw inline HTML. The ruby and citation passes then run over the
/// tree before it is serialised.
///
/// # Panics
///
/// Panics if KaTeX fails to render a math node, if HTML formatting fails, or if
/// the formatted output is not valid UTF-8.
pub fn render(raw: &str) -> String {
    let arena = Arena::new();
    let root = parse_document(&arena, raw, &OPTIONS);

    iter_nodes(root, &|node| {
        // Work out the replacement markup first, if this node needs one.
        let markup: Option<String> = match &mut node.data.borrow_mut().value {
            NodeValue::CodeBlock(block) => {
                let highlighted = ts::highlight(&block.info, &block.literal);
                Some(hypertext::Renderable::render(highlighted).into())
            }
            NodeValue::Math(math) => {
                let opts = katex::opts::Opts::builder()
                    .output_type(katex::OutputType::Mathml)
                    .display_mode(math.display_math)
                    .build()
                    .unwrap();
                Some(katex::render_with_opts(&math.literal, opts).unwrap())
            }
            _ => None,
        };

        // Swap the original node for an inline-HTML node carrying the markup.
        if let Some(markup) = markup {
            let ast = Ast::new(NodeValue::HtmlInline(markup), LineColumn { line: 0, column: 0 });
            let replacement = arena.alloc(AstNode::new(RefCell::new(ast)));
            node.insert_before(replacement);
            node.detach();
        }
    });

    super::ruby::add_ruby(root, &arena);
    super::cite::add_cite(root, &arena);

    let mut bytes = vec![];
    format_html(root, &OPTIONS, &mut bytes).unwrap();
    String::from_utf8(bytes).unwrap()
}

View file

@ -1,63 +0,0 @@
use std::cell::RefCell;
use comrak::{Arena, nodes::{Ast, AstNode, LineColumn, NodeValue}};
use once_cell::unsync::Lazy;
use regex::Regex;
use super::render::iter_nodes;
const RE_RUBY: Lazy<Regex> = Lazy::new(||
Regex::new(r"\[([^\]]+)\]\{([^}]+)\}").unwrap()
);
/// One piece of a string that has been split around ruby annotations.
///
/// Both variants borrow from the string that was annotated.
#[derive(Debug)]
enum Annotated<'a> {
/// A run of text that is not part of any `[base]{reading}` annotation.
Text(&'a str),
/// An annotation: the bracketed base text, then the braced reading.
Ruby(&'a str, &'a str),
}
/// Rewrites every text node below `root`, turning `[base]{reading}` spans into
/// `<ruby>` markup.
///
/// Each text node is split by `annotate`; the resulting pieces are inserted as
/// new nodes (allocated in `arena`) in front of the original node, which is
/// then detached. Plain pieces become text nodes and annotations become raw
/// inline HTML.
pub fn add_ruby<'a>(root: &'a AstNode<'a>, arena: &'a Arena<AstNode<'a>>) {
    iter_nodes(root, &|node| {
        if let NodeValue::Text(content) = &node.data.borrow_mut().value {
            for piece in annotate(content) {
                let value = match piece {
                    Annotated::Ruby(t, f) => NodeValue::HtmlInline(format!("<ruby>{t}<rp>(</rp><rt>{f}</rt><rp>)</rp></ruby>")),
                    Annotated::Text(plain) => NodeValue::Text(plain.into()),
                };
                let origin = LineColumn { line: 0, column: 0 };
                let replacement = arena.alloc(AstNode::new(RefCell::new(Ast::new(value, origin))));
                node.insert_before(replacement);
            }
            // The pieces now stand in for the original text node.
            node.detach();
        }
    });
}
/// Splits `input` into plain-text runs and `[base]{reading}` annotations, in
/// the order they occur.
///
/// Empty text runs are never emitted, so an empty `input` yields an empty
/// vector. All pieces borrow from `input`.
fn annotate(input: &str) -> Vec<Annotated> {
    let mut pieces = Vec::new();
    // Byte offset just past the previously consumed annotation.
    let mut cursor = 0;

    for caps in RE_RUBY.captures_iter(input) {
        let whole = caps.get(0).unwrap();

        // Plain text sitting between the previous annotation and this one.
        if cursor < whole.start() {
            pieces.push(Annotated::Text(&input[cursor..whole.start()]));
        }

        let base = caps.get(1).unwrap().as_str();
        let reading = caps.get(2).unwrap().as_str();
        pieces.push(Annotated::Ruby(base, reading));
        cursor = whole.end();
    }

    // Whatever follows the last annotation (or the whole input if none matched).
    if cursor < input.len() {
        pieces.push(Annotated::Text(&input[cursor..]));
    }

    pieces
}

101
src/text/md.rs Normal file
View file

@ -0,0 +1,101 @@
use hypertext::Renderable;
use once_cell::sync::Lazy;
use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag, TagEnd};
use crate::ts;
use super::ruby;
/// Parser options applied to every Markdown document handled by `parse`.
///
/// Enables math, tables, task lists, strikethrough and smart punctuation on
/// top of an otherwise empty option set.
static OPTS: Lazy<Options> = Lazy::new(||
Options::empty()
.union(Options::ENABLE_MATH)
.union(Options::ENABLE_TABLES)
.union(Options::ENABLE_TASKLISTS)
.union(Options::ENABLE_STRIKETHROUGH)
.union(Options::ENABLE_SMART_PUNCTUATION)
);
/// KaTeX options used for inline math: MathML output, with display mode left
/// at the builder's default.
///
/// Built once on first use; the builder is expected to succeed.
static KATEX_I: Lazy<katex::Opts> = Lazy::new(||
katex::opts::Opts::builder()
.output_type(katex::OutputType::Mathml)
.build()
.unwrap()
);
/// KaTeX options used for display (block) math: MathML output with display
/// mode switched on.
///
/// Built once on first use; the builder is expected to succeed.
static KATEX_B: Lazy<katex::Opts> = Lazy::new(||
katex::opts::Opts::builder()
.output_type(katex::OutputType::Mathml)
.display_mode(true)
.build()
.unwrap()
);
/// Converts the Markdown in `text` to an HTML string.
///
/// The parser's event stream passes through three stages before it is
/// serialised: math events are rendered with KaTeX, code blocks are replaced by
/// highlighted HTML, and `[base]{reading}` spans in text become `<ruby>` markup.
pub fn parse(text: &str) -> String {
    // Code-block handling needs look-behind state, so the events are buffered.
    let events: Vec<_> = Parser::new_ext(text, *OPTS).map(make_math).collect();
    let events = make_code(events).into_iter().flat_map(make_ruby);

    let mut out = String::new();
    pulldown_cmark::html::push_html(&mut out, events);
    out
}
/// Replaces math events with KaTeX-rendered markup; every other event is
/// returned unchanged.
///
/// Inline math becomes an inline-HTML event rendered with `KATEX_I`, display
/// math becomes an HTML event rendered with `KATEX_B`.
///
/// # Panics
///
/// Panics if KaTeX fails to render the expression.
fn make_math(event: Event) -> Event {
    match event {
        Event::InlineMath(tex) => {
            let markup = katex::render_with_opts(&tex, &*KATEX_I).unwrap();
            Event::InlineHtml(markup.into())
        }
        Event::DisplayMath(tex) => {
            let markup = katex::render_with_opts(&tex, &*KATEX_B).unwrap();
            Event::Html(markup.into())
        }
        other => other,
    }
}
/// Replaces every code block in `es` with a single HTML event holding the
/// output of `ts::highlight`.
///
/// The block's start and end events and the text between them are consumed;
/// all other events are passed through in order. Fenced blocks are highlighted
/// using their info string, indented blocks using an empty language name.
///
/// Indented blocks used to leave the "inside a code block" state unset, so
/// their text leaked into the output as ordinary text events and an empty
/// string was highlighted instead.
fn make_code(es: Vec<Event>) -> Vec<Event> {
    let mut buff = Vec::new();
    // `Some(info)` while inside a code block, `None` everywhere else.
    let mut lang = None;
    let mut code = String::new();

    for event in es {
        match event {
            Event::Start(Tag::CodeBlock(kind)) => {
                lang = Some(match kind {
                    CodeBlockKind::Fenced(name) => name,
                    // Indented blocks carry no info string.
                    CodeBlockKind::Indented => "".into(),
                });
            }
            Event::End(TagEnd::CodeBlock) => {
                let lang = lang.take().unwrap_or_else(|| "".into());
                let html = ts::highlight(&lang, &code).render().as_str().to_owned();
                buff.push(Event::Html(html.into()));
                code.clear();
            }
            // Text inside a code block is collected rather than emitted.
            Event::Text(text) if lang.is_some() => code.push_str(&text),
            other => buff.push(other),
        }
    }

    buff
}
/// Escapes `&`, `<` and `>` so that `raw` can be embedded in hand-written HTML.
fn escape_ruby_text(raw: &str) -> String {
    let mut escaped = String::with_capacity(raw.len());
    for ch in raw.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Expands `[base]{reading}` spans inside a text event into `<ruby>` markup.
///
/// A text event is split into its plain runs (re-emitted as text events) and
/// its annotations (emitted as inline HTML). Any other event is returned
/// unchanged as a one-element vector.
///
/// The base and reading come from a text event, so they are plain text rather
/// than HTML; they are escaped before being spliced into the raw markup so that
/// characters such as `<` or `&` cannot corrupt the output.
fn make_ruby(event: Event) -> Vec<Event> {
    match event {
        Event::Text(text) => {
            let mut buff = Vec::new();
            for item in ruby::annotate(&text) {
                match item {
                    ruby::Annotated::Text(text) => buff.push(Event::Text(text.to_owned().into())),
                    ruby::Annotated::Ruby(t, f) => {
                        let t = escape_ruby_text(t);
                        let f = escape_ruby_text(f);
                        buff.push(Event::InlineHtml(format!("<ruby>{t}<rp>(</rp><rt>{f}</rt><rp>)</rp></ruby>").into()));
                    }
                };
            }
            buff
        },
        _ => vec![event],
    }
}

2
src/text/mod.rs Normal file
View file

@ -0,0 +1,2 @@
pub mod md;
pub mod ruby;

38
src/text/ruby.rs Normal file
View file

@ -0,0 +1,38 @@
use once_cell::sync::Lazy;
use regex::Regex;
/// Matches ruby annotations written as `[base]{reading}`.
///
/// Capture group 1 is the bracketed base text and group 2 the braced reading;
/// neither may be empty. Compiled once, on first use.
static RE_RUBY: Lazy<Regex> = Lazy::new(||
Regex::new(r"\[([^\]]+)\]\{([^}]+)\}").unwrap()
);
/// One piece of a string that has been split around ruby annotations.
///
/// Both variants borrow from the string that was annotated.
#[derive(Debug)]
pub(crate) enum Annotated<'a> {
/// A run of text that is not part of any `[base]{reading}` annotation.
Text(&'a str),
/// An annotation: the bracketed base text, then the braced reading.
Ruby(&'a str, &'a str),
}
/// Splits `input` into plain-text runs and `[base]{reading}` annotations, in
/// the order they occur.
///
/// Empty text runs are never emitted, so an empty `input` yields an empty
/// vector. All pieces borrow from `input`.
pub fn annotate(input: &str) -> Vec<Annotated> {
    let mut pieces = Vec::new();
    // Byte offset just past the previously consumed annotation.
    let mut consumed = 0;

    for caps in RE_RUBY.captures_iter(input) {
        let span = caps.get(0).unwrap();

        // Plain text sitting between the previous annotation and this one.
        if consumed < span.start() {
            pieces.push(Annotated::Text(&input[consumed..span.start()]));
        }

        let base = caps.get(1).unwrap().as_str();
        let reading = caps.get(2).unwrap().as_str();
        pieces.push(Annotated::Ruby(base, reading));
        consumed = span.end();
    }

    // Whatever follows the last annotation (or the whole input if none matched).
    if consumed < input.len() {
        pieces.push(Annotated::Text(&input[consumed..]));
    }

    pieces
}