HTML
Lấy giá trị thuộc tính của các phần tử khớp với CSS selector
/// Downloads a page and collects one attribute from every matching element.
///
/// The URL is `base_url` with `id` appended. The response body is parsed as an
/// HTML document; the first element matching `list_selector` is used as the
/// search scope, falling back to the document's first `body` element when the
/// list selector matches nothing. Inside that scope every element matching
/// `item_selector` is visited and the value of its `element` attribute is
/// collected. Elements without that attribute are skipped.
///
/// A log line containing the URL is printed to stdout after the body has been
/// fetched and parsed.
///
/// # Panics
///
/// Panics if `http::get_body_from_url` does not yield a body, if either
/// selector string fails to parse, or if the fallback `body` lookup finds
/// nothing.
pub fn fetcher(
    base_url: String,
    id: &str,
    list_selector: &str,
    item_selector: &str,
    element: &str,
) -> Vec<String> {
    let url = base_url + id;
    let html = http::get_body_from_url(&url).unwrap();
    let document = Html::parse_document(&html);

    // Logging
    println!("Đang tải chương truyện với URL là: {}", &url);

    // Compile all selectors up front so a bad selector fails before any lookup.
    let list_sel = Selector::parse(list_selector).unwrap();
    let item_sel = Selector::parse(item_selector).unwrap();
    let body_sel = Selector::parse("body").unwrap();

    // Search inside the first list match, or inside <body> when there is none.
    let scope = document
        .select(&list_sel)
        .next()
        .unwrap_or_else(|| document.select(&body_sel).next().unwrap());

    scope
        .select(&item_sel)
        .filter_map(|node| node.value().attr(element))
        .map(|value| value.to_string())
        .collect()
}
Hiển thị bài viết (tiêu đề và nội dung) được trích từ HTML theo CSS selector
/// Builds an article from the parts of `request` matched by two CSS selectors
/// and hands it to the UI.
///
/// * `contents_selector` - selects the elements that make up the article body.
///   Each match's inner HTML is passed through `from_read` with `190` as the
///   second argument (presumably the wrap width; confirm against `from_read`),
///   and the results are joined with newlines.
/// * `title_selector` - selects the title elements. Each match is passed
///   through `from_read` with `50` and the results are concatenated.
/// * `request` - the parsed HTML document to read from.
///
/// The body text is then run through `filters::remove_references`,
/// `filters::remove_square_brackets` and `filters::remove_links`, in that
/// order, before being displayed.
///
/// # Panics
///
/// Panics if `ui::ArticleDisplay::new` returns an error.
pub fn display_article(
    contents_selector: scraper::Selector,
    title_selector: scraper::Selector,
    request: scraper::Html,
) {
    let rendered_body = request
        .select(&contents_selector)
        .map(|node| from_read(node.inner_html().as_bytes(), 190))
        .collect::<Vec<String>>()
        .join("\n");

    // Innermost call runs first: references, then square brackets, then links.
    let contents = filters::remove_links(filters::remove_square_brackets(
        filters::remove_references(rendered_body),
    ));

    let mut title = String::new();
    for node in request.select(&title_selector) {
        title.push_str(&from_read(node.inner_html().as_bytes(), 50));
    }

    ui::ArticleDisplay::new(ArticleDisplay { title, contents }).unwrap();
}
Code that strips leftover HTML-conversion artifacts (reference markers, bracketed link text, link footnotes) from the article text.
use regex::Regex;
/// Unwraps bracketed text, turning `[some text]` into `some text`.
///
/// Only brackets whose content consists solely of ASCII letters, digits,
/// parentheses, hyphens, commas, whitespace, `&` and `=` are unwrapped; any
/// other bracketed span is left untouched.
///
/// # Panics
///
/// Panics only if the hard-coded pattern fails to compile.
pub fn remove_square_brackets(text: String) -> String {
    // The outer brackets must be escaped. Left bare, the whole pattern parses
    // as a single character class, the `link` group never exists, and every
    // letter, digit and space is replaced by an empty string.
    // The comma is left unescaped: it is a plain literal inside a class, and
    // `\,` is rejected by older versions of the regex crate.
    let square_bracket_regex =
        Regex::new(r"\[(?P<link>[a-zA-Z0-9\(\)\-,[:space:]&=]+)\]").unwrap();
    // `${link}` is the braced form of `$link`, so following text cannot be
    // mistaken for part of the group name.
    square_bracket_regex
        .replace_all(&text, "${link}")
        .into_owned()
}
/// Deletes numeric reference markers such as `[1]` or `[[23]]` from `text`.
///
/// A marker is one or more `[`, followed by one or more ASCII digits, followed
/// by one or more `]`. Everything else, including ordinary numbers and
/// parentheses, is returned unchanged.
///
/// # Panics
///
/// Panics only if the hard-coded pattern fails to compile.
pub fn remove_references(text: String) -> String {
    // The brackets must be escaped. Left bare, `[+([0-9]+)]+` is a character
    // class over `+`, `(`, `)` and digits, which strips every digit and
    // parenthesis from the text instead of just the reference markers.
    let reference_regex = Regex::new(r"\[+[0-9]+\]+").unwrap();
    reference_regex.replace_all(&text, "").into_owned()
}
/// Deletes link and citation-note footnote fragments from `text`.
///
/// Two patterns are applied one after the other, and every match is removed:
///
/// 1. `": "` followed by a run of URL-like characters and a newline.
/// 2. `": #cite_note-…"` followed by a newline, in either the purely numeric
///    form or the `name-number` form.
///
/// # Panics
///
/// Panics only if one of the hard-coded patterns fails to compile.
pub fn remove_links(text: String) -> String {
    // NOTE(review): the bracket/slash nesting in this pattern looks as if it
    // may once have contained backslash escapes. It is reproduced verbatim
    // here; confirm the intended pattern against real input.
    let link_pattern = Regex::new(r": [/?/[a-zA-Z0-9-%:/(/)_.//&=]+/]+\n").unwrap();
    let cite_note_pattern =
        Regex::new(r": #cite_note-[0-9-]+\n|: #cite_note-[a-zA-Z_-]+-[0-9-]+\n").unwrap();

    let without_links = link_pattern.replace_all(&text, "");
    cite_note_pattern
        .replace_all(&without_links, "")
        .into_owned()
}
Chọn các elements trong HTML bằng select.rs
/// Reads an HTML dictionary file and extracts one `DictTerm` per `idx:entry`.
///
/// For each `idx:entry` element, the term is the `value` attribute of the
/// first `idx:orth` element found inside it, and the definition is the entry's
/// full text with every `"` replaced by `-`.
///
/// # Panics
///
/// Panics if the file cannot be read, if an entry contains no `idx:orth`
/// element, or if that element has no `value` attribute.
pub fn html_to_dict_term(file_path: &str) -> Vec<DictTerm> {
    let raw_html =
        fs::read_to_string(file_path).expect("Should have been able to read the file");
    let document = Document::from(raw_html.as_str());

    let mut terms = Vec::new();
    for entry in document.find(Name("idx:entry")) {
        // The first `idx:orth` inside the entry supplies the term.
        let headword = entry
            .find(Name("idx:orth"))
            .next()
            .unwrap()
            .attr("value")
            .unwrap();
        terms.push(DictTerm {
            term: headword.to_owned(),
            definition: entry.text().replace("\"", "-"),
        });
    }
    terms
}