mirror of
https://github.com/freedit-org/freedit.git
synced 2026-01-10 04:58:07 -05:00
[wip] search with tantivy
3  .gitignore  vendored
@@ -5,4 +5,5 @@
 .rustc_info.json
 .rustdoc_fingerprint.json
 config.toml
 /snapshots
+/tantivy
@@ -11,6 +11,7 @@ use crate::{
     },
     meta_handler::{handler_404, home, style},
     notification::notification,
+    search::search,
     solo::{solo, solo_delete, solo_like, solo_list, solo_post},
     upload::{gallery, upload, upload_pic_post, upload_post},
     user::{
@@ -104,6 +105,7 @@ pub async fn router(db: Db) -> Router {
         .route("/feed/star/:item_id", get(feed_star))
         .route("/feed/subscribe/:uid/:item_id", get(feed_subscribe))
        .route("/feed/read/:item_id", get(feed_read))
+        .route("/search", get(search))
         .with_state(db);

     let mut router_static = Router::new()
@@ -15,6 +15,7 @@ pub struct Config {
     pub(crate) avatars_path: String,
     pub(crate) inn_icons_path: String,
     pub(crate) upload_path: String,
+    pub(crate) tantivy_path: String,
     pub(crate) serve_dir: Vec<(String, String, String)>,
     cert: String,
     key: String,
@@ -40,6 +41,7 @@ impl Config {
         check_path(&config.avatars_path);
         check_path(&config.inn_icons_path);
         check_path(&config.upload_path);
+        check_path(&config.tantivy_path);

         config
     }
@@ -59,9 +61,10 @@ impl Default for Config {
         Config {
             db: "freedit.db".into(),
             addr: "127.0.0.1:3001".into(),
-            avatars_path: "./static/imgs/avatars".into(),
-            inn_icons_path: "./static/imgs/inn_icons".into(),
-            upload_path: "./static/imgs/upload".into(),
+            avatars_path: "static/imgs/avatars".into(),
+            inn_icons_path: "static/imgs/inn_icons".into(),
+            upload_path: "static/imgs/upload".into(),
+            tantivy_path: "tantivy".into(),
             serve_dir: vec![],
             cert: "".into(),
             key: "".into(),
@@ -74,6 +77,8 @@ fn check_path(path_str: &str) {
     let path = Path::new(path_str);
     if !path.exists() {
         fs::create_dir_all(path).unwrap();
         info!("create path: {}", path_str);
+    } else {
+        info!("{path_str} is ok");
     }
     info!("static path {path_str}");
 }
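Since config.toml is listed in .gitignore (first hunk), a deployment that wants the index somewhere other than the default would presumably override the new field there, e.g. `tantivy_path = "/var/lib/freedit/tantivy"` (a hypothetical path, assuming the TOML keys mirror the Config field names); without an override, the "tantivy" default above is used and check_path() creates the directory at startup.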
@@ -632,29 +632,29 @@ pub(super) async fn update(
     let content = CLIENT.get(url).send().await?.bytes().await?;

     let item_links_tree = db.open_tree("item_links")?;
+    let tan_tree = db.open_tree("tan")?;
     let mut item_ids = vec![];
     let feed = match rss::Channel::read_from(&content[..]) {
         Ok(rss) => {
             for item in rss.items.into_iter().take(n) {
                 let source_item: SourceItem = item.try_into()?;
-                if let None = item_links_tree.get(&source_item.link)? {
-                    let item_id = incr_id(db, "items_count")?;
-                    let item = Item {
-                        link: source_item.link,
-                        title: source_item.title,
-                        feed_title: rss.title.clone(),
-                        updated: source_item.updated,
-                        content: source_item.content,
-                    };
-
-                    item_links_tree.insert(&item.link, u32_to_ivec(item_id))?;
-                    set_one(db, "items", item_id, &item)?;
-
-                    item_ids.push((item_id, item.updated));
-                }
+                let item_id = if let Some(v) = item_links_tree.get(&source_item.link)? {
+                    ivec_to_u32(&v)
+                } else {
+                    incr_id(db, "items_count")?
+                };
+
+                let item = Item {
+                    link: source_item.link,
+                    title: source_item.title,
+                    feed_title: rss.title.clone(),
+                    updated: source_item.updated,
+                    content: source_item.content,
+                };
+
+                item_links_tree.insert(&item.link, u32_to_ivec(item_id))?;
+                set_one(db, "items", item_id, &item)?;
+
+                item_ids.push((item_id, item.updated));
+                tan_tree.insert(format!("item{}", item_id), &[])?;
             }

             Feed {
@@ -666,22 +666,22 @@ pub(super) async fn update(
         Ok(atom) => {
             for entry in atom.entries.into_iter().take(n) {
                 let source_item: SourceItem = entry.into();
-                if let None = item_links_tree.get(&source_item.link)? {
-                    let item_id = incr_id(db, "items_count")?;
-                    let item = Item {
-                        link: source_item.link,
-                        title: source_item.title,
-                        feed_title: atom.title.to_string(),
-                        updated: source_item.updated,
-                        content: source_item.content,
-                    };
-
-                    item_links_tree.insert(&item.link, u32_to_ivec(item_id))?;
-                    set_one(db, "items", item_id, &item)?;
-
-                    item_ids.push((item_id, item.updated));
-                }
+                let item_id = if let Some(v) = item_links_tree.get(&source_item.link)? {
+                    ivec_to_u32(&v)
+                } else {
+                    incr_id(db, "items_count")?
+                };
+                let item = Item {
+                    link: source_item.link,
+                    title: source_item.title,
+                    feed_title: atom.title.to_string(),
+                    updated: source_item.updated,
+                    content: source_item.content,
+                };
+
+                item_links_tree.insert(&item.link, u32_to_ivec(item_id))?;
+                set_one(db, "items", item_id, &item)?;
+
+                item_ids.push((item_id, item.updated));
+                tan_tree.insert(format!("item{}", item_id), &[])?;
             }

             Feed {
@@ -709,6 +709,12 @@ pub(crate) async fn edit_post_post(
     User::update_stats(&db, claim.uid, "post")?;
     claim.update_last_write(&db)?;

+    if inn.inn_type.as_str() != "Private" {
+        let is_update: &[u8] = if old_pid == 0 { &[] } else { &[0] };
+        db.open_tree("tan")?
+            .insert(format!("post{}", pid), is_update)?;
+    }
+
     let target = format!("/post/{iid}/{pid}");
     Ok(Redirect::to(&target))
 }
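The value written to the "tan" tree doubles as the operation flag for the background indexer: an empty value marks brand-new content, a single zero byte marks an edit. A hedged sketch of that convention as the sled watcher added in src/main.rs (further down) interprets it; `event` is assumed to be a sled::Event from the tree subscription:

    // Sketch only: how the indexing task reads the "tan" tree values.
    //   insert key with &[]   -> new content, add it to the index
    //   insert key with &[0]  -> edited content, delete the old doc, then re-add it
    //   remove key            -> content deleted, drop it from the index
    let (key, op) = match event {
        sled::Event::Insert { key, value } if value.len() == 1 => (key, "update"),
        sled::Event::Insert { key, .. } => (key, "add"),
        sled::Event::Remove { key } => (key, "delete"),
    };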
@@ -1575,6 +1581,12 @@ pub(crate) async fn comment_post(
     User::update_stats(&db, claim.uid, "comment")?;
     claim.update_last_write(&db)?;

+    let inn: Inn = get_one(&db, "inns", iid)?;
+    if inn.inn_type != "Private" {
+        db.open_tree("tan")?
+            .insert(format!("comt{}#{}", pid, cid), &[])?;
+    }
+
     let target = format!("/post/{iid}/{pid}");
     Ok(Redirect::to(&target))
 }
@@ -1643,7 +1655,10 @@ pub(crate) async fn comment_delete(

     inn_add_index(&db, iid, pid, timestamp as u32, visibility)?;

-    let target = format!("/post/{iid}/{pid}");
+    db.open_tree("tan")?
+        .remove(format!("comt{}#{}", pid, cid))?;
+
+    let target = format!("/post/{pid}/{cid}");
     Ok(Redirect::to(&target))
 }
@@ -14,7 +14,8 @@
 //! | "user_stats" | `timestamp_uid_type` | N |
 //! | "user_uploads" | `uid#img_id` | `image_hash.ext` |
 //! | default | "imgs_count" | N |
-//! | home_pages | `uid` | `u8` |
+//! | "home_pages" | `uid` | `u8` |
+//! | "tan" | `content_type#id` | `&[]` or `&[0]` |
 //!
 //! ### notification
 //! | tree | key | value |
@@ -107,21 +108,24 @@ pub mod db_utils;
 pub mod feed;
 pub mod meta_handler;
 pub mod notification;
+pub mod tantivy;

 pub(super) mod admin;
 pub(super) mod inn;
+pub(super) mod search;
 pub(super) mod solo;
 pub(super) mod upload;
 pub(super) mod user;

 mod fmt;
-mod tantity;

 use self::db_utils::{
     get_ids_by_prefix, get_one, incr_id, ivec_to_u32, u32_to_ivec, u8_slice_to_u32,
 };
-use self::fmt::md2html;
+use self::fmt::{md2html, ts_to_date};
+use self::tantivy::{ToDoc, FIELDS};
 use crate::{controller::meta_handler::into_response, error::AppError};
+use ::tantivy::Document;
 use bincode::config::standard;
 use bincode::{Decode, Encode};
 use chrono::{Days, Utc};
@@ -223,6 +227,18 @@ struct Solo {
     replies: Vec<u32>,
 }

+impl ToDoc for Solo {
+    fn to_doc(&self, _id: Option<u32>) -> Document {
+        let mut doc = Document::default();
+        doc.add_text(FIELDS.id, format!("solo{}", self.sid));
+        doc.add_text(FIELDS.title, &self.content);
+        doc.add_text(FIELDS.time, ts_to_date(self.created_at));
+        doc.add_u64(FIELDS.uid, self.uid as u64);
+        doc.add_text(FIELDS.content_type, "solo");
+        doc
+    }
+}
+
 #[derive(Encode, Decode, Serialize, Debug)]
 struct Inn {
     iid: u32,
@@ -289,6 +305,19 @@ pub struct Post {
     pub status: PostStatus,
 }

+impl ToDoc for Post {
+    fn to_doc(&self, _id: Option<u32>) -> Document {
+        let mut doc = Document::default();
+        doc.add_text(FIELDS.id, format!("post{}", self.pid));
+        doc.add_text(FIELDS.title, &self.title);
+        doc.add_text(FIELDS.time, ts_to_date(self.created_at));
+        doc.add_u64(FIELDS.uid, self.uid as u64);
+        doc.add_text(FIELDS.content, &self.content);
+        doc.add_text(FIELDS.content_type, "post");
+        doc
+    }
+}
+
 /// Form data: `/inn/:iid/post/:pid` post create/edit page
 #[derive(Debug, Default, Deserialize, Validate, Encode, Decode)]
 pub(super) struct FormPost {
@@ -314,6 +343,18 @@ struct Comment {
     is_hidden: bool,
 }

+impl ToDoc for Comment {
+    fn to_doc(&self, _id: Option<u32>) -> Document {
+        let mut doc = Document::default();
+        doc.add_text(FIELDS.id, format!("comt{}#{}", self.pid, self.cid));
+        doc.add_text(FIELDS.title, &self.content);
+        doc.add_text(FIELDS.time, ts_to_date(self.created_at));
+        doc.add_u64(FIELDS.uid, self.uid as u64);
+        doc.add_text(FIELDS.content_type, "comt");
+        doc
+    }
+}
+
 #[derive(Encode, Decode, Debug)]
 struct Feed {
     link: String,
@@ -329,6 +370,18 @@ struct Item {
     content: String,
 }

+impl ToDoc for Item {
+    fn to_doc(&self, id: Option<u32>) -> Document {
+        let mut doc = Document::default();
+        doc.add_text(FIELDS.id, format!("item{}", id.unwrap()));
+        doc.add_text(FIELDS.title, &self.title);
+        doc.add_text(FIELDS.time, ts_to_date(self.updated));
+        doc.add_text(FIELDS.content, &self.content);
+        doc.add_text(FIELDS.content_type, "item");
+        doc
+    }
+}
+
 /// Go to source code to see default value: [SiteConfig::default()]
 #[derive(Serialize, Deserialize, Encode, Decode, Validate, Debug)]
 pub(super) struct SiteConfig {
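Note that Item::to_doc is the only implementation that uses the Option<u32> argument: Post, Solo, and Comment each carry their own id (pid, sid, pid/cid), while Item stores none, so the feed item id allocated in feed.rs has to be passed in by the caller (add_doc in src/controller/tantivy.rs below does this with Some(id1)).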
36  src/controller/search.rs  Normal file
@@ -0,0 +1,36 @@
use axum::{extract::Query, response::IntoResponse};
use serde::Deserialize;
use tantivy::{collector::TopDocs, DocAddress, Score};

use crate::error::AppError;

use super::tantivy::SEARCHER;

#[derive(Debug, Deserialize)]
pub(crate) struct ParamsSearch {
    search: String,
    offset: Option<usize>,
}

pub(crate) async fn search(Query(input): Query<ParamsSearch>) -> impl IntoResponse {
    let offset = input.offset.unwrap_or_default();
    let search = input.search.trim();

    if !search.is_empty() {
        let Ok(query) = SEARCHER.query_parser.parse_query(&search) else {
            return AppError::Custom("Please remove special chars".to_owned()).into_response();
        };

        let searcher = SEARCHER.reader.searcher();
        let top_docs: Vec<(Score, DocAddress)> = searcher
            .search(&query, &TopDocs::with_limit(20).and_offset(offset))
            .unwrap_or_default();

        for (_score, doc_address) in top_docs {
            let retrieved_doc = searcher.doc(doc_address).unwrap();
            println!("{}", SEARCHER.schema.to_json(&retrieved_doc));
        }
    }

    return ().into_response();
}
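The handler is still a stub: it parses the query, fetches the top 20 hits after the given offset, and only prints each match to stdout. For illustration only, a hedged sketch of pulling the stored id field (e.g. "post123" or "comt45#1") back out of a hit with tantivy's Document::get_first, which is roughly what a later revision would need before looking the record up in sled and rendering it:

    // Sketch, not part of the commit: extract the stored "id" of each hit.
    for (_score, doc_address) in top_docs {
        if let Ok(doc) = searcher.doc(doc_address) {
            if let Some(id) = doc.get_first(FIELDS.id).and_then(|v| v.as_text()) {
                // `id` names the sled record to load, e.g. a Post for "post123".
                println!("hit: {id}");
            }
        }
    }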
@@ -470,6 +470,10 @@ pub(crate) async fn solo_post(
     User::update_stats(&db, claim.uid, "solo")?;
     claim.update_last_write(&db)?;

+    if visibility == 0 {
+        db.open_tree("tan")?.insert(format!("solo{}", sid), &[])?;
+    }
+
     let target = if input.reply_to > 0 {
         format!("/solo/{}", input.reply_to)
     } else {

@@ -554,6 +558,8 @@ pub(crate) async fn solo_delete(
     let k = [&u32_to_ivec(claim.uid), &sid_ivec].concat();
     db.open_tree("user_solos")?.remove(k)?;

+    db.open_tree("tan")?.remove(format!("solo{}", sid))?;
+
     let target = format!("/solo/user/{}", solo.uid);
     Ok(Redirect::to(&target))
 }
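Indexing is gated on visibility across all three handlers: edit_post_post and comment_post skip inns whose inn_type is "Private", and solo_post only writes the "tan" key when visibility == 0 (presumably the public level), so restricted content never reaches the search index in the first place.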
@@ -1,117 +0,0 @@
use std::collections::HashSet;

use jieba_rs::{Jieba, TokenizeMode};
use once_cell::sync::Lazy;
use rust_stemmers::{Algorithm, Stemmer};
use tantivy::tokenizer::{
    BoxTokenStream, RemoveLongFilter, TextAnalyzer, Token, TokenStream, Tokenizer,
};
use unicode_segmentation::UnicodeSegmentation;
use whichlang::detect_language;

const MULTI_LINGO_TOKENIZER: &str = "multi_lingo_tokenizer";

fn get_tokenizer() -> TextAnalyzer {
    TextAnalyzer::from(MultiLingoTokenizer).filter(RemoveLongFilter::limit(30))
}

#[derive(Clone)]
struct MultiLingoTokenizer;

impl Tokenizer for MultiLingoTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        if text.is_empty() {
            return BoxTokenStream::from(MultiLingoTokenStream {
                tokens: vec![],
                index: 0,
            });
        }

        let tokens = pre_tokenize_text(text);
        BoxTokenStream::from(MultiLingoTokenStream { tokens, index: 0 })
    }
}

struct MultiLingoTokenStream {
    tokens: Vec<Token>,
    index: usize,
}

impl TokenStream for MultiLingoTokenStream {
    fn advance(&mut self) -> bool {
        if self.index < self.tokens.len() {
            self.index += 1;
            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        &self.tokens[self.index - 1]
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.tokens[self.index - 1]
    }
}

static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
static STEMMER_ENG: Lazy<Stemmer> = Lazy::new(|| Stemmer::create(Algorithm::English));

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    match detect_language(text) {
        whichlang::Lang::Eng => {
            for (idx, (offset, word)) in text.unicode_word_indices().enumerate() {
                let word = word.to_lowercase();
                if !STOP_WORDS_ENG.contains(&word) {
                    tokens.push(Token {
                        offset_from: offset,
                        offset_to: offset + word.len(),
                        position: idx,
                        text: STEMMER_ENG.stem(&word).to_string(),
                        position_length: 1,
                    });
                }
            }
        }

        whichlang::Lang::Cmn => {
            let text = fast2s::convert(text);
            let orig_tokens = JIEBA.tokenize(&text, TokenizeMode::Search, true);
            let mut indices = text.char_indices().collect::<Vec<_>>();
            indices.push((text.len(), '\0'));

            for token in orig_tokens {
                if !STOP_WORDS_CMN.contains(token.word) {
                    tokens.push(Token {
                        offset_from: indices[token.start].0,
                        offset_to: indices[token.end].0,
                        position: token.start,
                        text: token.word.to_lowercase(),
                        position_length: 1,
                    });
                }
            }
        }

        _ => todo!(),
    }

    tokens
}

static STOP_WORDS_ENG: Lazy<HashSet<String>> = Lazy::new(|| {
    stop_words::get(stop_words::LANGUAGE::English)
        .into_iter()
        .collect()
});

static STOP_WORDS_CMN: Lazy<HashSet<String>> = Lazy::new(|| {
    let mut set: HashSet<_> = stop_words::get(stop_words::LANGUAGE::Chinese)
        .into_iter()
        .collect();
    set.insert(" ".to_string());
    set
});
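The deleted tokenizer above reappears almost verbatim inside the new src/controller/tantivy.rs below; the visible differences are that the standalone get_tokenizer()/RemoveLongFilter wrapper is dropped in favour of a word.len() <= 30 check inside pre_tokenize_text, the token Vec is pre-allocated, and the `_ => todo!()` fallback is replaced by a plain lowercased unicode-word branch for languages other than English and Chinese.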
284  src/controller/tantivy.rs  Normal file
@@ -0,0 +1,284 @@
use std::collections::HashSet;

use bincode::config::standard;
use jieba_rs::{Jieba, TokenizeMode};
use once_cell::sync::Lazy;
use rust_stemmers::{Algorithm, Stemmer};
use sled::Db;
use tantivy::{
    directory::MmapDirectory,
    query::QueryParser,
    schema::{
        Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST,
        INDEXED, STORED, STRING,
    },
    tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer},
    Document, Index, IndexReader, IndexWriter, Term,
};
use unicode_segmentation::UnicodeSegmentation;
use whichlang::detect_language;

use crate::{config::CONFIG, error::AppError};

use super::{
    db_utils::{get_one, u32_to_ivec},
    Comment, Item, Post, Solo,
};

pub(super) trait ToDoc {
    fn to_doc(&self, id: Option<u32>) -> Document;
}

pub(super) static SEARCHER: Lazy<Searcher> = Lazy::new(|| Tan::get_searcher().unwrap());
pub(super) static FIELDS: Lazy<Fields> = Lazy::new(|| Tan::set_schema().1);

pub struct Tan {
    writer: IndexWriter,
}

pub(super) struct Searcher {
    pub(super) reader: IndexReader,
    pub(super) query_parser: QueryParser,
    pub(super) schema: Schema,
}

pub(super) struct Fields {
    pub(super) id: Field,
    pub(super) title: Field,
    pub(super) time: Field,
    pub(super) uid: Field,
    pub(super) content: Field,
    pub(super) content_type: Field,
}

impl Tan {
    fn set_schema() -> (Schema, Fields) {
        let mut schema_builder = SchemaBuilder::default();

        let text_indexing = TextFieldIndexing::default()
            .set_tokenizer(MULTI_LINGO_TOKENIZER)
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let text_options_stored = TextOptions::default()
            .set_indexing_options(text_indexing.clone())
            .set_stored();
        let text_options_nostored = TextOptions::default().set_indexing_options(text_indexing);

        let id = schema_builder.add_text_field("id", STRING | STORED);
        let title = schema_builder.add_text_field("title", text_options_stored);
        let time = schema_builder.add_text_field("time", STORED);
        let uid = schema_builder.add_u64_field("uid", STORED | INDEXED);
        let content = schema_builder.add_text_field("content", text_options_nostored);
        let content_type = schema_builder.add_text_field("content_type", FAST | STRING);

        let fields = Fields {
            id,
            title,
            time,
            uid,
            content,
            content_type,
        };
        let schema = schema_builder.build();

        (schema, fields)
    }

    fn get_index() -> tantivy::Result<Index> {
        let (schema, _) = Tan::set_schema();
        let index = tantivy::Index::open_or_create(
            MmapDirectory::open(&CONFIG.tantivy_path).unwrap(),
            schema,
        )?;
        let tokenizer = MultiLingoTokenizer {};
        index
            .tokenizers()
            .register(MULTI_LINGO_TOKENIZER, tokenizer);
        Ok(index)
    }

    pub fn init() -> tantivy::Result<Self> {
        let index = Tan::get_index()?;
        let writer = index.writer(50 * 1024 * 1024)?;
        Ok(Tan { writer })
    }

    fn get_searcher() -> tantivy::Result<Searcher> {
        let (schema, _) = Tan::set_schema();
        let index = Tan::get_index()?;
        let reader = index.reader().unwrap();
        let mut query_parser = QueryParser::for_index(&index, vec![FIELDS.title, FIELDS.content]);
        query_parser.set_conjunction_by_default();
        query_parser.set_field_boost(FIELDS.title, 2.);

        Ok(Searcher {
            reader,
            query_parser,
            schema,
        })
    }

    /// id should be `post123` `comt45#1` `solo123` or `item123`
    ///
    /// It just add doc to tantivy, not commit.
    pub fn add_doc(&mut self, id: String, db: Db) -> Result<(), AppError> {
        let content_type = &id[0..4];
        let ids: Vec<_> = id[4..].split("#").collect();
        let id1: u32 = ids[0].parse().unwrap();
        let doc = match content_type {
            "post" => {
                let post: Post = get_one(&db, "posts", id1)?;
                post.to_doc(None)
            }
            "comt" => {
                let id2: u32 = ids[1].parse().unwrap();
                let k = [&u32_to_ivec(id1), &u32_to_ivec(id2)].concat();
                let v = db
                    .open_tree("post_comments")?
                    .get(k)?
                    .ok_or(AppError::NotFound)?;
                let (comment, _): (Comment, usize) = bincode::decode_from_slice(&v, standard())?;
                comment.to_doc(None)
            }
            "solo" => {
                let solo: Solo = get_one(&db, "solos", id1)?;
                solo.to_doc(None)
            }
            "item" => {
                let item: Item = get_one(&db, "items", id1)?;
                item.to_doc(Some(id1))
            }
            _ => unreachable!(),
        };

        self.writer.add_document(doc)?;

        Ok(())
    }

    pub fn del_index(&mut self, id: &str) -> tantivy::Result<()> {
        self.writer
            .delete_term(Term::from_field_text(FIELDS.id, id));
        Ok(())
    }

    pub fn commit(&mut self) -> tantivy::Result<()> {
        self.writer.commit()?;
        Ok(())
    }
}

const MULTI_LINGO_TOKENIZER: &str = "multi_lingo_tokenizer";

#[derive(Clone)]
struct MultiLingoTokenizer;

impl Tokenizer for MultiLingoTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        if text.is_empty() {
            return BoxTokenStream::from(MultiLingoTokenStream {
                tokens: vec![],
                index: 0,
            });
        }

        let tokens = pre_tokenize_text(text);
        BoxTokenStream::from(MultiLingoTokenStream { tokens, index: 0 })
    }
}

struct MultiLingoTokenStream {
    tokens: Vec<Token>,
    index: usize,
}

impl TokenStream for MultiLingoTokenStream {
    fn advance(&mut self) -> bool {
        if self.index < self.tokens.len() {
            self.index += 1;
            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        &self.tokens[self.index - 1]
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.tokens[self.index - 1]
    }
}

static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
static STEMMER_ENG: Lazy<Stemmer> = Lazy::new(|| Stemmer::create(Algorithm::English));

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut tokens = Vec::with_capacity(text.len() / 4);
    match detect_language(text) {
        whichlang::Lang::Eng => {
            for (idx, (offset, word)) in text.unicode_word_indices().enumerate() {
                let word = word.to_lowercase();
                if !STOP_WORDS_ENG.contains(&word) && word.len() <= 30 {
                    tokens.push(Token {
                        offset_from: offset,
                        offset_to: offset + word.len(),
                        position: idx,
                        text: STEMMER_ENG.stem(&word).to_string(),
                        position_length: 1,
                    });
                }
            }
        }

        whichlang::Lang::Cmn => {
            let text = fast2s::convert(text);
            let orig_tokens = JIEBA.tokenize(&text, TokenizeMode::Search, true);
            let mut indices = text.char_indices().collect::<Vec<_>>();
            indices.push((text.len(), '\0'));

            for token in orig_tokens {
                if !STOP_WORDS_CMN.contains(token.word) && token.word.len() <= 30 {
                    tokens.push(Token {
                        offset_from: indices[token.start].0,
                        offset_to: indices[token.end].0,
                        position: token.start,
                        text: token.word.to_lowercase(),
                        position_length: 1,
                    });
                }
            }
        }

        _ => {
            for (idx, (offset, word)) in text.unicode_word_indices().enumerate() {
                let word = word.to_lowercase();
                if word.len() <= 30 {
                    tokens.push(Token {
                        offset_from: offset,
                        offset_to: offset + word.len(),
                        position: idx,
                        text: word,
                        position_length: 1,
                    });
                }
            }
        }
    }

    tokens
}

static STOP_WORDS_ENG: Lazy<HashSet<String>> = Lazy::new(|| {
    stop_words::get(stop_words::LANGUAGE::English)
        .into_iter()
        .collect()
});

static STOP_WORDS_CMN: Lazy<HashSet<String>> = Lazy::new(|| {
    let mut set: HashSet<_> = stop_words::get(stop_words::LANGUAGE::Chinese)
        .into_iter()
        .collect();
    set.insert(" ".to_string());
    set
});
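To tie the pieces together, a minimal sketch (not part of the commit) of the write/read cycle this module provides, assuming a sled::Db handle `db` and an existing Post with id 1; visibility of new documents after commit() depends on the IndexReader's default reload policy:

    // Hedged sketch: index one post, then query it back (unwraps for brevity).
    let mut tan = Tan::init().unwrap();                    // opens or creates CONFIG.tantivy_path
    tan.add_doc("post1".to_owned(), db.clone()).unwrap();  // loads Post 1 from sled via ToDoc
    tan.commit().unwrap();                                 // makes the document searchable

    let query = SEARCHER.query_parser.parse_query("some words").unwrap();
    let hits = SEARCHER
        .reader
        .searcher()
        .search(&query, &tantivy::collector::TopDocs::with_limit(20))
        .unwrap();

This is exactly the cycle main.rs drives incrementally through the sled watcher, and the read half is what the /search handler performs per request.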
@@ -15,6 +15,8 @@ pub enum AppError {
     IoError(#[from] std::io::Error),
     #[error("You must join inn first")]
     NoJoinedInn,
+    #[error(transparent)]
+    TantivyError(#[from] tantivy::TantivyError),

     // 4XX
     #[error("Captcha Error")]
39  src/main.rs
@@ -4,7 +4,9 @@ use chrono::Utc;
 use freedit::{
     app_router::router,
     config::CONFIG,
-    controller::{db_utils::clear_invalid, feed::cron_feed, meta_handler::shutdown_signal},
+    controller::{
+        db_utils::clear_invalid, feed::cron_feed, meta_handler::shutdown_signal, tantivy::Tan,
+    },
     error::AppError,
     CURRENT_SHA256, GIT_COMMIT, VERSION,
 };
@@ -58,7 +60,40 @@ async fn main() -> Result<(), AppError> {
             if let Err(e) = clear_invalid(&db2, "user_stats").await {
                 error!(%e);
             }
-            sleep_seconds(3600 * 8).await;
+            sleep_seconds(3600 * 4).await;
         }
     });

+    let db2 = db.clone();
+    tokio::spawn(async move {
+        // TODO:
+        // 1. inn feed search
+        // 2. rebuild whole search
+        let mut tan = Tan::init().unwrap();
+        let mut subscriber = db2.open_tree("tan").unwrap().watch_prefix(vec![]);
+        while let Some(event) = (&mut subscriber).await {
+            let db2 = db2.clone();
+            let (k, op_type) = match event {
+                sled::Event::Insert { key, value } => {
+                    if value.len() == 1 {
+                        (key, "update")
+                    } else {
+                        (key, "add")
+                    }
+                }
+                sled::Event::Remove { key } => (key, "delete"),
+            };
+            let id = String::from_utf8_lossy(&k);
+
+            if op_type == "delete" || op_type == "update" {
+                tan.del_index(&id).unwrap();
+            }
+
+            if op_type == "update" || op_type == "add" {
+                tan.add_doc(id.into(), db2).unwrap();
+            }
+
+            tan.commit().unwrap();
+        }
+    });