mirror of
https://github.com/freedit-org/freedit.git
synced 2026-01-10 04:58:07 -05:00
[wip] search with tantivy
3  .gitignore  vendored
@@ -5,4 +5,5 @@
 .rustc_info.json
 .rustdoc_fingerprint.json
 config.toml
 /snapshots
+/tantivy
@@ -11,6 +11,7 @@ use crate::{
     },
     meta_handler::{handler_404, home, style},
     notification::notification,
+    search::search,
     solo::{solo, solo_delete, solo_like, solo_list, solo_post},
     upload::{gallery, upload, upload_pic_post, upload_post},
     user::{
@@ -104,6 +105,7 @@ pub async fn router(db: Db) -> Router {
         .route("/feed/star/:item_id", get(feed_star))
         .route("/feed/subscribe/:uid/:item_id", get(feed_subscribe))
        .route("/feed/read/:item_id", get(feed_read))
+        .route("/search", get(search))
         .with_state(db);

     let mut router_static = Router::new()
@@ -15,6 +15,7 @@ pub struct Config {
     pub(crate) avatars_path: String,
     pub(crate) inn_icons_path: String,
     pub(crate) upload_path: String,
+    pub(crate) tantivy_path: String,
     pub(crate) serve_dir: Vec<(String, String, String)>,
     cert: String,
     key: String,
@@ -40,6 +41,7 @@ impl Config {
         check_path(&config.avatars_path);
         check_path(&config.inn_icons_path);
         check_path(&config.upload_path);
+        check_path(&config.tantivy_path);

         config
     }
@@ -59,9 +61,10 @@ impl Default for Config {
         Config {
             db: "freedit.db".into(),
             addr: "127.0.0.1:3001".into(),
-            avatars_path: "./static/imgs/avatars".into(),
-            inn_icons_path: "./static/imgs/inn_icons".into(),
-            upload_path: "./static/imgs/upload".into(),
+            avatars_path: "static/imgs/avatars".into(),
+            inn_icons_path: "static/imgs/inn_icons".into(),
+            upload_path: "static/imgs/upload".into(),
+            tantivy_path: "tantivy".into(),
             serve_dir: vec![],
             cert: "".into(),
             key: "".into(),
@@ -74,6 +77,8 @@ fn check_path(path_str: &str) {
     let path = Path::new(path_str);
     if !path.exists() {
         fs::create_dir_all(path).unwrap();
         info!("create path: {}", path_str);
+    } else {
+        info!("{path_str} is ok");
     }
     info!("static path {path_str}");
 }
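Since config.toml is listed in .gitignore (first hunk), a deployment that wants the index somewhere other than the default would presumably override the new field there, e.g. `tantivy_path = "/var/lib/freedit/tantivy"` (a hypothetical path, assuming the TOML keys mirror the Config field names); without an override, the "tantivy" default above is used and check_path() creates the directory at startup.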
@@ -632,29 +632,29 @@ pub(super) async fn update(
     let content = CLIENT.get(url).send().await?.bytes().await?;

     let item_links_tree = db.open_tree("item_links")?;
+    let tan_tree = db.open_tree("tan")?;
     let mut item_ids = vec![];
     let feed = match rss::Channel::read_from(&content[..]) {
         Ok(rss) => {
             for item in rss.items.into_iter().take(n) {
                 let source_item: SourceItem = item.try_into()?;
-                if let None = item_links_tree.get(&source_item.link)? {
-                    let item_id = incr_id(db, "items_count")?;
-                    let item = Item {
-                        link: source_item.link,
-                        title: source_item.title,
-                        feed_title: rss.title.clone(),
-                        updated: source_item.updated,
-                        content: source_item.content,
-                    };
-
-                    item_links_tree.insert(&item.link, u32_to_ivec(item_id))?;
-                    set_one(db, "items", item_id, &item)?;
-
-                    item_ids.push((item_id, item.updated));
-                }
+                let item_id = if let Some(v) = item_links_tree.get(&source_item.link)? {
+                    ivec_to_u32(&v)
+                } else {
+                    incr_id(db, "items_count")?
+                };
+
+                let item = Item {
+                    link: source_item.link,
+                    title: source_item.title,
+                    feed_title: rss.title.clone(),
+                    updated: source_item.updated,
+                    content: source_item.content,
+                };
+
+                item_links_tree.insert(&item.link, u32_to_ivec(item_id))?;
+                set_one(db, "items", item_id, &item)?;
+
+                item_ids.push((item_id, item.updated));
+                tan_tree.insert(format!("item{}", item_id), &[])?;
             }

             Feed {
@@ -666,22 +666,22 @@ pub(super) async fn update(
         Ok(atom) => {
             for entry in atom.entries.into_iter().take(n) {
                 let source_item: SourceItem = entry.into();
-                if let None = item_links_tree.get(&source_item.link)? {
-                    let item_id = incr_id(db, "items_count")?;
-                    let item = Item {
-                        link: source_item.link,
-                        title: source_item.title,
-                        feed_title: atom.title.to_string(),
-                        updated: source_item.updated,
-                        content: source_item.content,
-                    };
-
-                    item_links_tree.insert(&item.link, u32_to_ivec(item_id))?;
-                    set_one(db, "items", item_id, &item)?;
-
-                    item_ids.push((item_id, item.updated));
-                }
+                let item_id = if let Some(v) = item_links_tree.get(&source_item.link)? {
+                    ivec_to_u32(&v)
+                } else {
+                    incr_id(db, "items_count")?
+                };
+                let item = Item {
+                    link: source_item.link,
+                    title: source_item.title,
+                    feed_title: atom.title.to_string(),
+                    updated: source_item.updated,
+                    content: source_item.content,
+                };
+
+                item_links_tree.insert(&item.link, u32_to_ivec(item_id))?;
+                set_one(db, "items", item_id, &item)?;
+
+                item_ids.push((item_id, item.updated));
+                tan_tree.insert(format!("item{}", item_id), &[])?;
             }

             Feed {
@@ -709,6 +709,12 @@ pub(crate) async fn edit_post_post(
     User::update_stats(&db, claim.uid, "post")?;
     claim.update_last_write(&db)?;

+    if inn.inn_type.as_str() != "Private" {
+        let is_update: &[u8] = if old_pid == 0 { &[] } else { &[0] };
+        db.open_tree("tan")?
+            .insert(format!("post{}", pid), is_update)?;
+    }
+
     let target = format!("/post/{iid}/{pid}");
     Ok(Redirect::to(&target))
 }
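The value written to the "tan" tree doubles as the operation flag for the background indexer: an empty value marks brand-new content, a single zero byte marks an edit. A hedged sketch of that convention as the sled watcher added in src/main.rs (further down) interprets it; `event` is assumed to be a sled::Event from the tree subscription:

    // Sketch only: how the indexing task reads the "tan" tree values.
    //   insert key with &[]   -> new content, add it to the index
    //   insert key with &[0]  -> edited content, delete the old doc, then re-add it
    //   remove key            -> content deleted, drop it from the index
    let (key, op) = match event {
        sled::Event::Insert { key, value } if value.len() == 1 => (key, "update"),
        sled::Event::Insert { key, .. } => (key, "add"),
        sled::Event::Remove { key } => (key, "delete"),
    };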
@@ -1575,6 +1581,12 @@ pub(crate) async fn comment_post(
     User::update_stats(&db, claim.uid, "comment")?;
     claim.update_last_write(&db)?;

+    let inn: Inn = get_one(&db, "inns", iid)?;
+    if inn.inn_type != "Private" {
+        db.open_tree("tan")?
+            .insert(format!("comt{}#{}", pid, cid), &[])?;
+    }
+
     let target = format!("/post/{iid}/{pid}");
     Ok(Redirect::to(&target))
 }
@@ -1643,7 +1655,10 @@ pub(crate) async fn comment_delete(

     inn_add_index(&db, iid, pid, timestamp as u32, visibility)?;

-    let target = format!("/post/{iid}/{pid}");
+    db.open_tree("tan")?
+        .remove(format!("comt{}#{}", pid, cid))?;
+
+    let target = format!("/post/{pid}/{cid}");
     Ok(Redirect::to(&target))
 }
@@ -14,7 +14,8 @@
 //! | "user_stats" | `timestamp_uid_type` | N |
 //! | "user_uploads" | `uid#img_id` | `image_hash.ext` |
 //! | default | "imgs_count" | N |
-//! | home_pages | `uid` | `u8` |
+//! | "home_pages" | `uid` | `u8` |
+//! | "tan" | `content_type#id` | `&[]` or `&[0]` |
 //!
 //! ### notification
 //! | tree | key | value |
@@ -107,21 +108,24 @@ pub mod db_utils;
 pub mod feed;
 pub mod meta_handler;
 pub mod notification;
+pub mod tantivy;

 pub(super) mod admin;
 pub(super) mod inn;
+pub(super) mod search;
 pub(super) mod solo;
 pub(super) mod upload;
 pub(super) mod user;

 mod fmt;
-mod tantity;

 use self::db_utils::{
     get_ids_by_prefix, get_one, incr_id, ivec_to_u32, u32_to_ivec, u8_slice_to_u32,
 };
-use self::fmt::md2html;
+use self::fmt::{md2html, ts_to_date};
+use self::tantivy::{ToDoc, FIELDS};
 use crate::{controller::meta_handler::into_response, error::AppError};
+use ::tantivy::Document;
 use bincode::config::standard;
 use bincode::{Decode, Encode};
 use chrono::{Days, Utc};
@@ -223,6 +227,18 @@ struct Solo {
     replies: Vec<u32>,
 }

+impl ToDoc for Solo {
+    fn to_doc(&self, _id: Option<u32>) -> Document {
+        let mut doc = Document::default();
+        doc.add_text(FIELDS.id, format!("solo{}", self.sid));
+        doc.add_text(FIELDS.title, &self.content);
+        doc.add_text(FIELDS.time, ts_to_date(self.created_at));
+        doc.add_u64(FIELDS.uid, self.uid as u64);
+        doc.add_text(FIELDS.content_type, "solo");
+        doc
+    }
+}
+
 #[derive(Encode, Decode, Serialize, Debug)]
 struct Inn {
     iid: u32,
@@ -289,6 +305,19 @@ pub struct Post {
     pub status: PostStatus,
 }

+impl ToDoc for Post {
+    fn to_doc(&self, _id: Option<u32>) -> Document {
+        let mut doc = Document::default();
+        doc.add_text(FIELDS.id, format!("post{}", self.pid));
+        doc.add_text(FIELDS.title, &self.title);
+        doc.add_text(FIELDS.time, ts_to_date(self.created_at));
+        doc.add_u64(FIELDS.uid, self.uid as u64);
+        doc.add_text(FIELDS.content, &self.content);
+        doc.add_text(FIELDS.content_type, "post");
+        doc
+    }
+}
+
 /// Form data: `/inn/:iid/post/:pid` post create/edit page
 #[derive(Debug, Default, Deserialize, Validate, Encode, Decode)]
 pub(super) struct FormPost {
@@ -314,6 +343,18 @@ struct Comment {
     is_hidden: bool,
 }

+impl ToDoc for Comment {
+    fn to_doc(&self, _id: Option<u32>) -> Document {
+        let mut doc = Document::default();
+        doc.add_text(FIELDS.id, format!("comt{}#{}", self.pid, self.cid));
+        doc.add_text(FIELDS.title, &self.content);
+        doc.add_text(FIELDS.time, ts_to_date(self.created_at));
+        doc.add_u64(FIELDS.uid, self.uid as u64);
+        doc.add_text(FIELDS.content_type, "comt");
+        doc
+    }
+}
+
 #[derive(Encode, Decode, Debug)]
 struct Feed {
     link: String,
@@ -329,6 +370,18 @@ struct Item {
     content: String,
 }

+impl ToDoc for Item {
+    fn to_doc(&self, id: Option<u32>) -> Document {
+        let mut doc = Document::default();
+        doc.add_text(FIELDS.id, format!("item{}", id.unwrap()));
+        doc.add_text(FIELDS.title, &self.title);
+        doc.add_text(FIELDS.time, ts_to_date(self.updated));
+        doc.add_text(FIELDS.content, &self.content);
+        doc.add_text(FIELDS.content_type, "item");
+        doc
+    }
+}
+
 /// Go to source code to see default value: [SiteConfig::default()]
 #[derive(Serialize, Deserialize, Encode, Decode, Validate, Debug)]
 pub(super) struct SiteConfig {
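Note that Item::to_doc is the only implementation that uses the Option<u32> argument: Post, Solo, and Comment each carry their own id (pid, sid, pid/cid), while Item stores none, so the feed item id allocated in feed.rs has to be passed in by the caller (add_doc in src/controller/tantivy.rs below does this with Some(id1)).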
36  src/controller/search.rs  Normal file
@@ -0,0 +1,36 @@
use axum::{extract::Query, response::IntoResponse};
use serde::Deserialize;
use tantivy::{collector::TopDocs, DocAddress, Score};

use crate::error::AppError;

use super::tantivy::SEARCHER;

#[derive(Debug, Deserialize)]
pub(crate) struct ParamsSearch {
    search: String,
    offset: Option<usize>,
}

pub(crate) async fn search(Query(input): Query<ParamsSearch>) -> impl IntoResponse {
    let offset = input.offset.unwrap_or_default();
    let search = input.search.trim();

    if !search.is_empty() {
        let Ok(query) = SEARCHER.query_parser.parse_query(&search) else {
            return AppError::Custom("Please remove special chars".to_owned()).into_response();
        };

        let searcher = SEARCHER.reader.searcher();
        let top_docs: Vec<(Score, DocAddress)> = searcher
            .search(&query, &TopDocs::with_limit(20).and_offset(offset))
            .unwrap_or_default();

        for (_score, doc_address) in top_docs {
            let retrieved_doc = searcher.doc(doc_address).unwrap();
            println!("{}", SEARCHER.schema.to_json(&retrieved_doc));
        }
    }

    return ().into_response();
}
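The handler is still a stub: it parses the query, fetches the top 20 hits after the given offset, and only prints each match to stdout. For illustration only, a hedged sketch of pulling the stored id field (e.g. "post123" or "comt45#1") back out of a hit with tantivy's Document::get_first, which is roughly what a later revision would need before looking the record up in sled and rendering it:

    // Sketch, not part of the commit: extract the stored "id" of each hit.
    for (_score, doc_address) in top_docs {
        if let Ok(doc) = searcher.doc(doc_address) {
            if let Some(id) = doc.get_first(FIELDS.id).and_then(|v| v.as_text()) {
                // `id` names the sled record to load, e.g. a Post for "post123".
                println!("hit: {id}");
            }
        }
    }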
@@ -470,6 +470,10 @@ pub(crate) async fn solo_post(
     User::update_stats(&db, claim.uid, "solo")?;
     claim.update_last_write(&db)?;

+    if visibility == 0 {
+        db.open_tree("tan")?.insert(format!("solo{}", sid), &[])?;
+    }
+
     let target = if input.reply_to > 0 {
         format!("/solo/{}", input.reply_to)
     } else {

@@ -554,6 +558,8 @@ pub(crate) async fn solo_delete(
     let k = [&u32_to_ivec(claim.uid), &sid_ivec].concat();
     db.open_tree("user_solos")?.remove(k)?;

+    db.open_tree("tan")?.remove(format!("solo{}", sid))?;
+
     let target = format!("/solo/user/{}", solo.uid);
     Ok(Redirect::to(&target))
 }
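Indexing is gated on visibility across all three handlers: edit_post_post and comment_post skip inns whose inn_type is "Private", and solo_post only writes the "tan" key when visibility == 0 (presumably the public level), so restricted content never reaches the search index in the first place.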
@@ -1,117 +0,0 @@
use std::collections::HashSet;

use jieba_rs::{Jieba, TokenizeMode};
use once_cell::sync::Lazy;
use rust_stemmers::{Algorithm, Stemmer};
use tantivy::tokenizer::{
    BoxTokenStream, RemoveLongFilter, TextAnalyzer, Token, TokenStream, Tokenizer,
};
use unicode_segmentation::UnicodeSegmentation;
use whichlang::detect_language;

const MULTI_LINGO_TOKENIZER: &str = "multi_lingo_tokenizer";

fn get_tokenizer() -> TextAnalyzer {
    TextAnalyzer::from(MultiLingoTokenizer).filter(RemoveLongFilter::limit(30))
}

#[derive(Clone)]
struct MultiLingoTokenizer;

impl Tokenizer for MultiLingoTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        if text.is_empty() {
            return BoxTokenStream::from(MultiLingoTokenStream {
                tokens: vec![],
                index: 0,
            });
        }

        let tokens = pre_tokenize_text(text);
        BoxTokenStream::from(MultiLingoTokenStream { tokens, index: 0 })
    }
}

struct MultiLingoTokenStream {
    tokens: Vec<Token>,
    index: usize,
}

impl TokenStream for MultiLingoTokenStream {
    fn advance(&mut self) -> bool {
        if self.index < self.tokens.len() {
            self.index += 1;
            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        &self.tokens[self.index - 1]
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.tokens[self.index - 1]
    }
}

static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
static STEMMER_ENG: Lazy<Stemmer> = Lazy::new(|| Stemmer::create(Algorithm::English));

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    match detect_language(text) {
        whichlang::Lang::Eng => {
            for (idx, (offset, word)) in text.unicode_word_indices().enumerate() {
                let word = word.to_lowercase();
                if !STOP_WORDS_ENG.contains(&word) {
                    tokens.push(Token {
                        offset_from: offset,
                        offset_to: offset + word.len(),
                        position: idx,
                        text: STEMMER_ENG.stem(&word).to_string(),
                        position_length: 1,
                    });
                }
            }
        }

        whichlang::Lang::Cmn => {
            let text = fast2s::convert(text);
            let orig_tokens = JIEBA.tokenize(&text, TokenizeMode::Search, true);
            let mut indices = text.char_indices().collect::<Vec<_>>();
            indices.push((text.len(), '\0'));

            for token in orig_tokens {
                if !STOP_WORDS_CMN.contains(token.word) {
                    tokens.push(Token {
                        offset_from: indices[token.start].0,
                        offset_to: indices[token.end].0,
                        position: token.start,
                        text: token.word.to_lowercase(),
                        position_length: 1,
                    });
                }
            }
        }

        _ => todo!(),
    }

    tokens
}

static STOP_WORDS_ENG: Lazy<HashSet<String>> = Lazy::new(|| {
    stop_words::get(stop_words::LANGUAGE::English)
        .into_iter()
        .collect()
});

static STOP_WORDS_CMN: Lazy<HashSet<String>> = Lazy::new(|| {
    let mut set: HashSet<_> = stop_words::get(stop_words::LANGUAGE::Chinese)
        .into_iter()
        .collect();
    set.insert(" ".to_string());
    set
});
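The deleted tokenizer above reappears almost verbatim inside the new src/controller/tantivy.rs below; the visible differences are that the standalone get_tokenizer()/RemoveLongFilter wrapper is dropped in favour of a word.len() <= 30 check inside pre_tokenize_text, the token Vec is pre-allocated, and the `_ => todo!()` fallback is replaced by a plain lowercased unicode-word branch for languages other than English and Chinese.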
284  src/controller/tantivy.rs  Normal file
@@ -0,0 +1,284 @@
use std::collections::HashSet;

use bincode::config::standard;
use jieba_rs::{Jieba, TokenizeMode};
use once_cell::sync::Lazy;
use rust_stemmers::{Algorithm, Stemmer};
use sled::Db;
use tantivy::{
    directory::MmapDirectory,
    query::QueryParser,
    schema::{
        Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST,
        INDEXED, STORED, STRING,
    },
    tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer},
    Document, Index, IndexReader, IndexWriter, Term,
};
use unicode_segmentation::UnicodeSegmentation;
use whichlang::detect_language;

use crate::{config::CONFIG, error::AppError};

use super::{
    db_utils::{get_one, u32_to_ivec},
    Comment, Item, Post, Solo,
};

pub(super) trait ToDoc {
    fn to_doc(&self, id: Option<u32>) -> Document;
}

pub(super) static SEARCHER: Lazy<Searcher> = Lazy::new(|| Tan::get_searcher().unwrap());
pub(super) static FIELDS: Lazy<Fields> = Lazy::new(|| Tan::set_schema().1);

pub struct Tan {
    writer: IndexWriter,
}

pub(super) struct Searcher {
    pub(super) reader: IndexReader,
    pub(super) query_parser: QueryParser,
    pub(super) schema: Schema,
}

pub(super) struct Fields {
    pub(super) id: Field,
    pub(super) title: Field,
    pub(super) time: Field,
    pub(super) uid: Field,
    pub(super) content: Field,
    pub(super) content_type: Field,
}

impl Tan {
    fn set_schema() -> (Schema, Fields) {
        let mut schema_builder = SchemaBuilder::default();

        let text_indexing = TextFieldIndexing::default()
            .set_tokenizer(MULTI_LINGO_TOKENIZER)
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let text_options_stored = TextOptions::default()
            .set_indexing_options(text_indexing.clone())
            .set_stored();
        let text_options_nostored = TextOptions::default().set_indexing_options(text_indexing);

        let id = schema_builder.add_text_field("id", STRING | STORED);
        let title = schema_builder.add_text_field("title", text_options_stored);
        let time = schema_builder.add_text_field("time", STORED);
        let uid = schema_builder.add_u64_field("uid", STORED | INDEXED);
        let content = schema_builder.add_text_field("content", text_options_nostored);
        let content_type = schema_builder.add_text_field("content_type", FAST | STRING);

        let fields = Fields {
            id,
            title,
            time,
            uid,
            content,
            content_type,
        };
        let schema = schema_builder.build();

        (schema, fields)
    }

    fn get_index() -> tantivy::Result<Index> {
        let (schema, _) = Tan::set_schema();
        let index = tantivy::Index::open_or_create(
            MmapDirectory::open(&CONFIG.tantivy_path).unwrap(),
            schema,
        )?;
        let tokenizer = MultiLingoTokenizer {};
        index
            .tokenizers()
            .register(MULTI_LINGO_TOKENIZER, tokenizer);
        Ok(index)
    }

    pub fn init() -> tantivy::Result<Self> {
        let index = Tan::get_index()?;
        let writer = index.writer(50 * 1024 * 1024)?;
        Ok(Tan { writer })
    }

    fn get_searcher() -> tantivy::Result<Searcher> {
        let (schema, _) = Tan::set_schema();
        let index = Tan::get_index()?;
        let reader = index.reader().unwrap();
        let mut query_parser = QueryParser::for_index(&index, vec![FIELDS.title, FIELDS.content]);
        query_parser.set_conjunction_by_default();
        query_parser.set_field_boost(FIELDS.title, 2.);

        Ok(Searcher {
            reader,
            query_parser,
            schema,
        })
    }

    /// id should be `post123` `comt45#1` `solo123` or `item123`
    ///
    /// It just add doc to tantivy, not commit.
    pub fn add_doc(&mut self, id: String, db: Db) -> Result<(), AppError> {
        let content_type = &id[0..4];
        let ids: Vec<_> = id[4..].split("#").collect();
        let id1: u32 = ids[0].parse().unwrap();
        let doc = match content_type {
            "post" => {
                let post: Post = get_one(&db, "posts", id1)?;
                post.to_doc(None)
            }
            "comt" => {
                let id2: u32 = ids[1].parse().unwrap();
                let k = [&u32_to_ivec(id1), &u32_to_ivec(id2)].concat();
                let v = db
                    .open_tree("post_comments")?
                    .get(k)?
                    .ok_or(AppError::NotFound)?;
                let (comment, _): (Comment, usize) = bincode::decode_from_slice(&v, standard())?;
                comment.to_doc(None)
            }
            "solo" => {
                let solo: Solo = get_one(&db, "solos", id1)?;
                solo.to_doc(None)
            }
            "item" => {
                let item: Item = get_one(&db, "items", id1)?;
                item.to_doc(Some(id1))
            }
            _ => unreachable!(),
        };

        self.writer.add_document(doc)?;

        Ok(())
    }

    pub fn del_index(&mut self, id: &str) -> tantivy::Result<()> {
        self.writer
            .delete_term(Term::from_field_text(FIELDS.id, id));
        Ok(())
    }

    pub fn commit(&mut self) -> tantivy::Result<()> {
        self.writer.commit()?;
        Ok(())
    }
}

const MULTI_LINGO_TOKENIZER: &str = "multi_lingo_tokenizer";

#[derive(Clone)]
struct MultiLingoTokenizer;

impl Tokenizer for MultiLingoTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        if text.is_empty() {
            return BoxTokenStream::from(MultiLingoTokenStream {
                tokens: vec![],
                index: 0,
            });
        }

        let tokens = pre_tokenize_text(text);
        BoxTokenStream::from(MultiLingoTokenStream { tokens, index: 0 })
    }
}

struct MultiLingoTokenStream {
    tokens: Vec<Token>,
    index: usize,
}

impl TokenStream for MultiLingoTokenStream {
    fn advance(&mut self) -> bool {
        if self.index < self.tokens.len() {
            self.index += 1;
            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        &self.tokens[self.index - 1]
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.tokens[self.index - 1]
    }
}

static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
static STEMMER_ENG: Lazy<Stemmer> = Lazy::new(|| Stemmer::create(Algorithm::English));

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut tokens = Vec::with_capacity(text.len() / 4);
    match detect_language(text) {
        whichlang::Lang::Eng => {
            for (idx, (offset, word)) in text.unicode_word_indices().enumerate() {
                let word = word.to_lowercase();
                if !STOP_WORDS_ENG.contains(&word) && word.len() <= 30 {
                    tokens.push(Token {
                        offset_from: offset,
                        offset_to: offset + word.len(),
                        position: idx,
                        text: STEMMER_ENG.stem(&word).to_string(),
                        position_length: 1,
                    });
                }
            }
        }

        whichlang::Lang::Cmn => {
            let text = fast2s::convert(text);
            let orig_tokens = JIEBA.tokenize(&text, TokenizeMode::Search, true);
            let mut indices = text.char_indices().collect::<Vec<_>>();
            indices.push((text.len(), '\0'));

            for token in orig_tokens {
                if !STOP_WORDS_CMN.contains(token.word) && token.word.len() <= 30 {
                    tokens.push(Token {
                        offset_from: indices[token.start].0,
                        offset_to: indices[token.end].0,
                        position: token.start,
                        text: token.word.to_lowercase(),
                        position_length: 1,
                    });
                }
            }
        }

        _ => {
            for (idx, (offset, word)) in text.unicode_word_indices().enumerate() {
                let word = word.to_lowercase();
                if word.len() <= 30 {
                    tokens.push(Token {
                        offset_from: offset,
                        offset_to: offset + word.len(),
                        position: idx,
                        text: word,
                        position_length: 1,
                    });
                }
            }
        }
    }

    tokens
}

static STOP_WORDS_ENG: Lazy<HashSet<String>> = Lazy::new(|| {
    stop_words::get(stop_words::LANGUAGE::English)
        .into_iter()
        .collect()
});

static STOP_WORDS_CMN: Lazy<HashSet<String>> = Lazy::new(|| {
    let mut set: HashSet<_> = stop_words::get(stop_words::LANGUAGE::Chinese)
        .into_iter()
        .collect();
    set.insert(" ".to_string());
    set
});
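To tie the pieces together, a minimal sketch (not part of the commit) of the write/read cycle this module provides, assuming a sled::Db handle `db` and an existing Post with id 1; visibility of new documents after commit() depends on the IndexReader's default reload policy:

    // Hedged sketch: index one post, then query it back (unwraps for brevity).
    let mut tan = Tan::init().unwrap();                    // opens or creates CONFIG.tantivy_path
    tan.add_doc("post1".to_owned(), db.clone()).unwrap();  // loads Post 1 from sled via ToDoc
    tan.commit().unwrap();                                 // makes the document searchable

    let query = SEARCHER.query_parser.parse_query("some words").unwrap();
    let hits = SEARCHER
        .reader
        .searcher()
        .search(&query, &tantivy::collector::TopDocs::with_limit(20))
        .unwrap();

This is exactly the cycle main.rs drives incrementally through the sled watcher, and the read half is what the /search handler performs per request.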
@@ -15,6 +15,8 @@ pub enum AppError {
     IoError(#[from] std::io::Error),
     #[error("You must join inn first")]
     NoJoinedInn,
+    #[error(transparent)]
+    TantivyError(#[from] tantivy::TantivyError),

     // 4XX
     #[error("Captcha Error")]
39  src/main.rs
@@ -4,7 +4,9 @@ use chrono::Utc;
 use freedit::{
     app_router::router,
     config::CONFIG,
-    controller::{db_utils::clear_invalid, feed::cron_feed, meta_handler::shutdown_signal},
+    controller::{
+        db_utils::clear_invalid, feed::cron_feed, meta_handler::shutdown_signal, tantivy::Tan,
+    },
     error::AppError,
     CURRENT_SHA256, GIT_COMMIT, VERSION,
 };
@@ -58,7 +60,40 @@ async fn main() -> Result<(), AppError> {
             if let Err(e) = clear_invalid(&db2, "user_stats").await {
                 error!(%e);
             }
-            sleep_seconds(3600 * 8).await;
+            sleep_seconds(3600 * 4).await;
         }
     });

+    let db2 = db.clone();
+    tokio::spawn(async move {
+        // TODO:
+        // 1. inn feed search
+        // 2. rebuild whole search
+        let mut tan = Tan::init().unwrap();
+        let mut subscriber = db2.open_tree("tan").unwrap().watch_prefix(vec![]);
+        while let Some(event) = (&mut subscriber).await {
+            let db2 = db2.clone();
+            let (k, op_type) = match event {
+                sled::Event::Insert { key, value } => {
+                    if value.len() == 1 {
+                        (key, "update")
+                    } else {
+                        (key, "add")
+                    }
+                }
+                sled::Event::Remove { key } => (key, "delete"),
+            };
+            let id = String::from_utf8_lossy(&k);
+
+            if op_type == "delete" || op_type == "update" {
+                tan.del_index(&id).unwrap();
+            }
+
+            if op_type == "update" || op_type == "add" {
+                tan.add_doc(id.into(), db2).unwrap();
+            }
+
+            tan.commit().unwrap();
+        }
+    });