package main

import (
	"context"
	"io"
	"sort"
	"strings"

	"github.com/yuin/goldmark"
	"github.com/yuin/goldmark/ast"
	"github.com/yuin/goldmark/extension"
	"github.com/yuin/goldmark/text"
	"golang.org/x/net/html"
)

// htmlReplacement describes one substitution to apply to a document body:
// the bytes in [Start, Stop) are replaced by Value.
type htmlReplacement struct {
	Start int    // byte offset into the body where the replaced span begins
	Stop  int    // byte offset into the body just past the replaced span
	Value string // text written in place of body[Start:Stop]
}

// translateHTMLBlocks parses body as Markdown (with the GFM extension), runs
// every HTML block found in it through translateHTMLBlock, and returns body
// with those blocks replaced by their translated form. Everything outside
// HTML blocks is returned unchanged.
//
// If any block fails to translate, the walk stops and that error is returned
// together with an empty string; a partially translated body is never
// returned as a success.
func translateHTMLBlocks(ctx context.Context, translator *PiTranslator, body, srcLang, tgtLang string) (string, error) {
	source := []byte(body)
	r := text.NewReader(source)
	md := goldmark.New(
		goldmark.WithExtensions(extension.GFM),
	)
	doc := md.Parser().Parse(r)

	replacements := make([]htmlReplacement, 0, 8)
	err := ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
		if !entering {
			return ast.WalkContinue, nil
		}
		block, ok := n.(*ast.HTMLBlock)
		if !ok {
			return ast.WalkContinue, nil
		}
		start, stop, ok := htmlBlockSpan(block, source)
		if !ok {
			return ast.WalkSkipChildren, nil
		}
		htmlText := string(source[start:stop])
		translated, err := translateHTMLBlock(ctx, translator, htmlText, srcLang, tgtLang)
		if err != nil {
			return ast.WalkStop, err
		}
		replacements = append(replacements, htmlReplacement{Start: start, Stop: stop, Value: translated})
		return ast.WalkSkipChildren, nil
	})
	if err != nil {
		// The walker only returns an error when a block failed to translate.
		// Surface it instead of silently returning a half-translated body.
		return "", err
	}

	if len(replacements) == 0 {
		return body, nil
	}
	return applyHTMLReplacements(body, replacements), nil
}

// htmlBlockSpan returns the byte range [start, stop) covered by the lines of
// block, from the start of its first line to the stop of its last line. The
// final result is false when the block has no lines or the range is empty.
//
// The source parameter is currently unused.
//
// NOTE(review): goldmark appears to keep the closing line of some HTML block
// kinds separately (HTMLBlock.ClosureLine) rather than in Lines(); if so,
// that line is outside the returned span and is left untranslated. Confirm
// this is intended.
func htmlBlockSpan(block *ast.HTMLBlock, source []byte) (int, int, bool) {
	lines := block.Lines()
	if lines.Len() == 0 {
		return 0, 0, false
	}
	start := lines.At(0).Start
	stop := lines.At(lines.Len() - 1).Stop
	if start >= stop {
		return 0, 0, false
	}
	return start, stop, true
}

// applyHTMLReplacements returns body with each replacement's [Start, Stop)
// span substituted by its Value. Replacements are applied in ascending Start
// order; one that begins before the end of the previously applied
// replacement is skipped. The replacements slice is sorted in place.
func applyHTMLReplacements(body string, replacements []htmlReplacement) string {
	if len(replacements) == 0 {
		return body
	}
	sortHTMLReplacements(replacements)

	var out strings.Builder
	last := 0 // offset in body up to which output has been produced
	for _, rep := range replacements {
		if rep.Start < last {
			// Overlaps the previous replacement; keep the earlier one.
			continue
		}
		out.WriteString(body[last:rep.Start])
		out.WriteString(rep.Value)
		last = rep.Stop
	}
	out.WriteString(body[last:])
	return out.String()
}

// sortHTMLReplacements orders replacements in place by ascending Start.
func sortHTMLReplacements(replacements []htmlReplacement) {
	sort.Slice(replacements, func(i, j int) bool {
		return replacements[i].Start < replacements[j].Start
	})
}

// translateHTMLBlock tokenizes htmlText and rebuilds it, passing text tokens
// through translator.Translate and copying every other token through
// verbatim. Text is left untranslated when it is whitespace-only or when it
// appears inside a code, pre, script or style element.
//
// It returns an empty string and the error if the tokenizer fails with
// anything other than io.EOF, or if a translation call fails.
func translateHTMLBlock(ctx context.Context, translator *PiTranslator, htmlText, srcLang, tgtLang string) (string, error) {
	tokenizer := html.NewTokenizer(strings.NewReader(htmlText))
	var out strings.Builder
	skipDepth := 0 // number of currently open skip tags

	for {
		tt := tokenizer.Next()
		if tt == html.ErrorToken {
			if err := tokenizer.Err(); err != nil && err != io.EOF {
				return "", err
			}
			break
		}

		// Copy the raw bytes before calling Token, which may change the
		// contents of the slice returned by Raw.
		raw := string(tokenizer.Raw())
		tok := tokenizer.Token()

		switch tt {
		case html.StartTagToken:
			out.WriteString(raw)
			if isSkipTag(strings.ToLower(tok.Data)) {
				skipDepth++
			}
		case html.EndTagToken:
			out.WriteString(raw)
			if isSkipTag(strings.ToLower(tok.Data)) && skipDepth > 0 {
				skipDepth--
			}
		case html.SelfClosingTagToken:
			out.WriteString(raw)
		case html.TextToken:
			if !shouldTranslateHTMLText(skipDepth, raw) {
				out.WriteString(raw)
				break
			}
			translated, err := translator.Translate(ctx, raw, srcLang, tgtLang)
			if err != nil {
				return "", err
			}
			out.WriteString(translated)
		default:
			out.WriteString(raw)
		}
	}
	return out.String(), nil
}

// shouldTranslateHTMLText reports whether text should be sent to the
// translator: it must contain something other than whitespace and must not
// be nested inside a skip tag (skipDepth == 0).
func shouldTranslateHTMLText(skipDepth int, text string) bool {
	if strings.TrimSpace(text) == "" {
		return false
	}
	return skipDepth == 0
}

// isSkipTag reports whether tag names an element whose text content is never
// translated. The comparison is exact, so callers pass a lower-cased name.
func isSkipTag(tag string) bool {
	switch tag {
	case "code", "pre", "script", "style":
		return true
	default:
		return false
	}
}