mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 19:31:00 +00:00
docs-i18n: chunk raw doc translation (#62969)
Merged via squash.
Prepared head SHA: 6a16d66486
Co-authored-by: hxy91819 <8814856+hxy91819@users.noreply.github.com>
Co-authored-by: hxy91819 <8814856+hxy91819@users.noreply.github.com>
Reviewed-by: @hxy91819
This commit is contained in:
823
scripts/docs-i18n/doc_chunked_raw.go
Normal file
823
scripts/docs-i18n/doc_chunked_raw.go
Normal file
@@ -0,0 +1,823 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const defaultDocChunkMaxBytes = 12000
|
||||
const defaultDocChunkPromptBudget = 15000
|
||||
|
||||
var (
|
||||
docsFenceRE = regexp.MustCompile(`^\s*(` + "```" + `|~~~)`)
|
||||
docsComponentTagRE = regexp.MustCompile(`<(/?)([A-Z][A-Za-z0-9]*)\b[^>]*?/?>`)
|
||||
)
|
||||
|
||||
var docsProtocolTokens = []string{
|
||||
frontmatterTagStart,
|
||||
frontmatterTagEnd,
|
||||
bodyTagStart,
|
||||
bodyTagEnd,
|
||||
"[[[FM_",
|
||||
}
|
||||
|
||||
type docChunkStructure struct {
|
||||
fenceCount int
|
||||
tagCounts map[string]int
|
||||
}
|
||||
|
||||
type docChunkSplitPlan struct {
|
||||
groups [][]string
|
||||
reason string
|
||||
}
|
||||
|
||||
func translateDocBodyChunked(ctx context.Context, translator docsTranslator, relPath, body, srcLang, tgtLang string) (string, error) {
|
||||
if strings.TrimSpace(body) == "" {
|
||||
return body, nil
|
||||
}
|
||||
blocks := splitDocBodyIntoBlocks(body)
|
||||
groups := groupDocBlocks(blocks, docsI18nDocChunkMaxBytes())
|
||||
logDocChunkPlan(relPath, blocks, groups)
|
||||
out := strings.Builder{}
|
||||
for index, group := range groups {
|
||||
chunkID := fmt.Sprintf("%s.chunk-%03d", relPath, index+1)
|
||||
translated, err := translateDocBlockGroup(ctx, translator, chunkID, group, srcLang, tgtLang)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
out.WriteString(translated)
|
||||
}
|
||||
return out.String(), nil
|
||||
}
|
||||
|
||||
func translateDocBlockGroup(ctx context.Context, translator docsTranslator, chunkID string, blocks []string, srcLang, tgtLang string) (string, error) {
|
||||
source := strings.Join(blocks, "")
|
||||
if strings.TrimSpace(source) == "" {
|
||||
return source, nil
|
||||
}
|
||||
if plan, ok := planDocChunkSplit(blocks, docsI18nDocChunkMaxBytes(), docsI18nDocChunkPromptBudget()); ok {
|
||||
logDocChunkPlanSplit(chunkID, plan, source)
|
||||
return translatePlannedDocChunkGroups(ctx, translator, chunkID, plan.groups, srcLang, tgtLang)
|
||||
}
|
||||
normalizedSource, commonIndent := stripCommonIndent(source)
|
||||
log.Printf("docs-i18n: chunk start %s blocks=%d bytes=%d", chunkID, len(blocks), len(source))
|
||||
translated, err := translator.TranslateRaw(ctx, normalizedSource, srcLang, tgtLang)
|
||||
if err == nil {
|
||||
translated = sanitizeDocChunkProtocolWrappers(source, translated)
|
||||
translated = reapplyCommonIndent(translated, commonIndent)
|
||||
if validationErr := validateDocChunkTranslation(source, translated); validationErr == nil {
|
||||
log.Printf("docs-i18n: chunk done %s out_bytes=%d", chunkID, len(translated))
|
||||
return translated, nil
|
||||
} else {
|
||||
err = validationErr
|
||||
}
|
||||
}
|
||||
if len(blocks) <= 1 {
|
||||
if fallback, fallbackErr := translateDocLeafBlock(ctx, translator, chunkID, source, srcLang, tgtLang); fallbackErr == nil {
|
||||
return fallback, nil
|
||||
}
|
||||
if plan, ok := planSingletonDocChunkRetry(source, docsI18nDocChunkMaxBytes(), docsI18nDocChunkPromptBudget()); ok {
|
||||
logDocChunkPlanSplit(chunkID, plan, source)
|
||||
return translatePlannedDocChunkGroups(ctx, translator, chunkID, plan.groups, srcLang, tgtLang)
|
||||
}
|
||||
return "", fmt.Errorf("%s: %w", chunkID, err)
|
||||
}
|
||||
if plan, ok := planDocChunkSplit(blocks, docsI18nDocChunkMaxBytes(), docsI18nDocChunkPromptBudget()); ok {
|
||||
logDocChunkSplit(chunkID, len(blocks), err)
|
||||
return translatePlannedDocChunkGroups(ctx, translator, chunkID, plan.groups, srcLang, tgtLang)
|
||||
}
|
||||
if plan, ok := splitDocChunkBlocksMidpointSimple(blocks); ok {
|
||||
logDocChunkSplit(chunkID, len(blocks), err)
|
||||
return translatePlannedDocChunkGroups(ctx, translator, chunkID, plan.groups, srcLang, tgtLang)
|
||||
}
|
||||
return "", fmt.Errorf("%s: %w", chunkID, err)
|
||||
}
|
||||
|
||||
func translateDocLeafBlock(ctx context.Context, translator docsTranslator, chunkID, source, srcLang, tgtLang string) (string, error) {
|
||||
sourceStructure := summarizeDocChunkStructure(source)
|
||||
if sourceStructure.fenceCount != 0 {
|
||||
return "", fmt.Errorf("%s: raw leaf fallback not applicable", chunkID)
|
||||
}
|
||||
normalizedSource, commonIndent := stripCommonIndent(source)
|
||||
maskedSource, placeholders := maskDocComponentTags(normalizedSource)
|
||||
translated, err := translator.Translate(ctx, maskedSource, srcLang, tgtLang)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
translated, err = restoreDocComponentTags(translated, placeholders)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
translated = sanitizeDocChunkProtocolWrappers(source, translated)
|
||||
translated = reapplyCommonIndent(translated, commonIndent)
|
||||
if validationErr := validateDocChunkTranslation(source, translated); validationErr != nil {
|
||||
return "", validationErr
|
||||
}
|
||||
log.Printf("docs-i18n: chunk leaf-fallback done %s out_bytes=%d", chunkID, len(translated))
|
||||
return translated, nil
|
||||
}
|
||||
|
||||
func splitDocBodyIntoBlocks(body string) []string {
|
||||
if body == "" {
|
||||
return nil
|
||||
}
|
||||
lines := strings.SplitAfter(body, "\n")
|
||||
blocks := make([]string, 0, len(lines))
|
||||
var current strings.Builder
|
||||
fenceDelimiter := ""
|
||||
for _, line := range lines {
|
||||
current.WriteString(line)
|
||||
fenceDelimiter, _ = updateFenceDelimiter(fenceDelimiter, line)
|
||||
inFence := fenceDelimiter != ""
|
||||
if !inFence && strings.TrimSpace(line) == "" {
|
||||
blocks = append(blocks, current.String())
|
||||
current.Reset()
|
||||
}
|
||||
}
|
||||
if current.Len() > 0 {
|
||||
blocks = append(blocks, current.String())
|
||||
}
|
||||
if len(blocks) == 0 {
|
||||
return []string{body}
|
||||
}
|
||||
return blocks
|
||||
}
|
||||
|
||||
func groupDocBlocks(blocks []string, maxBytes int) [][]string {
|
||||
if len(blocks) == 0 {
|
||||
return nil
|
||||
}
|
||||
if maxBytes <= 0 {
|
||||
maxBytes = defaultDocChunkMaxBytes
|
||||
}
|
||||
groups := make([][]string, 0, len(blocks))
|
||||
current := make([]string, 0, 8)
|
||||
currentBytes := 0
|
||||
flush := func() {
|
||||
if len(current) == 0 {
|
||||
return
|
||||
}
|
||||
groups = append(groups, current)
|
||||
current = make([]string, 0, 8)
|
||||
currentBytes = 0
|
||||
}
|
||||
for _, block := range blocks {
|
||||
blockBytes := len(block)
|
||||
if len(current) > 0 && currentBytes+blockBytes > maxBytes {
|
||||
flush()
|
||||
}
|
||||
if blockBytes > maxBytes {
|
||||
groups = append(groups, []string{block})
|
||||
continue
|
||||
}
|
||||
current = append(current, block)
|
||||
currentBytes += blockBytes
|
||||
}
|
||||
flush()
|
||||
return groups
|
||||
}
|
||||
|
||||
func validateDocChunkTranslation(source, translated string) error {
|
||||
if hasUnexpectedTopLevelProtocolWrapper(source, translated) {
|
||||
return fmt.Errorf("protocol token leaked: top-level wrapper")
|
||||
}
|
||||
sourceLower := strings.ToLower(source)
|
||||
translatedLower := strings.ToLower(translated)
|
||||
for _, token := range docsProtocolTokens {
|
||||
tokenLower := strings.ToLower(token)
|
||||
if strings.Contains(sourceLower, tokenLower) {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(translatedLower, tokenLower) {
|
||||
return fmt.Errorf("protocol token leaked: %s", token)
|
||||
}
|
||||
}
|
||||
sourceStructure := summarizeDocChunkStructure(source)
|
||||
translatedStructure := summarizeDocChunkStructure(translated)
|
||||
if sourceStructure.fenceCount != translatedStructure.fenceCount {
|
||||
return fmt.Errorf("code fence mismatch: source=%d translated=%d", sourceStructure.fenceCount, translatedStructure.fenceCount)
|
||||
}
|
||||
if !slices.Equal(sortedKeys(sourceStructure.tagCounts), sortedKeys(translatedStructure.tagCounts)) {
|
||||
return fmt.Errorf("component tag set mismatch")
|
||||
}
|
||||
for _, key := range sortedKeys(sourceStructure.tagCounts) {
|
||||
if sourceStructure.tagCounts[key] != translatedStructure.tagCounts[key] {
|
||||
return fmt.Errorf("component tag mismatch for %s: source=%d translated=%d", key, sourceStructure.tagCounts[key], translatedStructure.tagCounts[key])
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func sanitizeDocChunkProtocolWrappers(source, translated string) string {
|
||||
if !containsProtocolWrapperToken(translated) {
|
||||
return translated
|
||||
}
|
||||
trimmedTranslated := strings.TrimSpace(translated)
|
||||
if !hasUnexpectedTopLevelProtocolWrapper(source, trimmedTranslated) {
|
||||
return translated
|
||||
}
|
||||
if !hasAmbiguousTaggedBodyClose(source, trimmedTranslated) {
|
||||
_, body, err := parseTaggedDocument(trimmedTranslated)
|
||||
if err == nil {
|
||||
if strings.TrimSpace(body) == "" {
|
||||
return translated
|
||||
}
|
||||
return body
|
||||
}
|
||||
}
|
||||
body, ok := stripBodyOnlyWrapper(trimmedTranslated)
|
||||
if !ok || strings.TrimSpace(body) == "" {
|
||||
return translated
|
||||
}
|
||||
return body
|
||||
}
|
||||
|
||||
func stripBodyOnlyWrapper(text string) (string, bool) {
|
||||
lower := strings.ToLower(text)
|
||||
bodyStartLower := strings.ToLower(bodyTagStart)
|
||||
bodyEndLower := strings.ToLower(bodyTagEnd)
|
||||
if !strings.HasPrefix(lower, bodyStartLower) || !strings.HasSuffix(lower, bodyEndLower) {
|
||||
return "", false
|
||||
}
|
||||
body := text[len(bodyTagStart) : len(text)-len(bodyTagEnd)]
|
||||
bodyLower := lower[len(bodyTagStart) : len(lower)-len(bodyTagEnd)]
|
||||
if strings.Contains(bodyLower, bodyStartLower) || strings.Contains(bodyLower, bodyEndLower) {
|
||||
return "", false
|
||||
}
|
||||
return trimTagNewlines(body), true
|
||||
}
|
||||
|
||||
func hasAmbiguousTaggedBodyClose(source, translated string) bool {
|
||||
sourceLower := strings.ToLower(source)
|
||||
if !strings.Contains(sourceLower, strings.ToLower(bodyTagStart)) && !strings.Contains(sourceLower, strings.ToLower(bodyTagEnd)) {
|
||||
return false
|
||||
}
|
||||
translatedLower := strings.ToLower(translated)
|
||||
if !strings.Contains(translatedLower, strings.ToLower(frontmatterTagStart)) {
|
||||
return false
|
||||
}
|
||||
return strings.Count(translatedLower, strings.ToLower(bodyTagEnd)) == 1
|
||||
}
|
||||
|
||||
func maskDocComponentTags(text string) (string, []string) {
|
||||
placeholders := make([]string, 0, 4)
|
||||
masked := docsComponentTagRE.ReplaceAllStringFunc(text, func(match string) string {
|
||||
placeholder := fmt.Sprintf("__OC_DOC_TAG_%03d__", len(placeholders))
|
||||
placeholders = append(placeholders, match)
|
||||
return placeholder
|
||||
})
|
||||
return masked, placeholders
|
||||
}
|
||||
|
||||
func restoreDocComponentTags(text string, placeholders []string) (string, error) {
|
||||
restored := text
|
||||
for index, original := range placeholders {
|
||||
placeholder := fmt.Sprintf("__OC_DOC_TAG_%03d__", index)
|
||||
if !strings.Contains(restored, placeholder) {
|
||||
return "", fmt.Errorf("component tag placeholder missing: %s", placeholder)
|
||||
}
|
||||
restored = strings.ReplaceAll(restored, placeholder, original)
|
||||
}
|
||||
return restored, nil
|
||||
}
|
||||
|
||||
func logDocChunkSplit(chunkID string, blockCount int, err error) {
|
||||
if docsI18nVerboseLogs() || blockCount >= 16 {
|
||||
log.Printf("docs-i18n: chunk split %s blocks=%d err=%v", chunkID, blockCount, err)
|
||||
}
|
||||
}
|
||||
|
||||
func logDocChunkPlanSplit(chunkID string, plan docChunkSplitPlan, source string) {
|
||||
if plan.reason == "" {
|
||||
plan.reason = "unknown"
|
||||
}
|
||||
log.Printf("docs-i18n: chunk pre-split %s reason=%s groups=%d bytes=%d", chunkID, plan.reason, len(plan.groups), len(source))
|
||||
}
|
||||
|
||||
func summarizeDocChunkStructure(text string) docChunkStructure {
|
||||
counts := map[string]int{}
|
||||
lines := strings.Split(text, "\n")
|
||||
fenceDelimiter := ""
|
||||
for _, line := range lines {
|
||||
var toggled bool
|
||||
fenceDelimiter, toggled = updateFenceDelimiter(fenceDelimiter, line)
|
||||
if toggled {
|
||||
counts["__fence_toggle__"]++
|
||||
}
|
||||
for _, match := range docsComponentTagRE.FindAllStringSubmatch(line, -1) {
|
||||
if len(match) < 3 {
|
||||
continue
|
||||
}
|
||||
fullToken := match[0]
|
||||
tagName := match[2]
|
||||
direction := "open"
|
||||
if match[1] == "/" {
|
||||
direction = "close"
|
||||
}
|
||||
if strings.HasSuffix(fullToken, "/>") {
|
||||
direction = "self"
|
||||
}
|
||||
counts[tagName+":"+direction]++
|
||||
}
|
||||
}
|
||||
return docChunkStructure{
|
||||
fenceCount: counts["__fence_toggle__"],
|
||||
tagCounts: countsWithoutFence(counts),
|
||||
}
|
||||
}
|
||||
|
||||
func countsWithoutFence(counts map[string]int) map[string]int {
|
||||
filtered := map[string]int{}
|
||||
for key, value := range counts {
|
||||
if key == "__fence_toggle__" {
|
||||
continue
|
||||
}
|
||||
filtered[key] = value
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
func sortedKeys(counts map[string]int) []string {
|
||||
keys := make([]string, 0, len(counts))
|
||||
for key := range counts {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
slices.Sort(keys)
|
||||
return keys
|
||||
}
|
||||
|
||||
func updateFenceDelimiter(current, line string) (string, bool) {
|
||||
delimiter := leadingFenceDelimiter(line)
|
||||
if delimiter == "" {
|
||||
return current, false
|
||||
}
|
||||
if current == "" {
|
||||
return delimiter, true
|
||||
}
|
||||
if delimiter[0] == current[0] && len(delimiter) >= len(current) && isClosingFenceLine(line, delimiter) {
|
||||
return "", true
|
||||
}
|
||||
return current, false
|
||||
}
|
||||
|
||||
func leadingFenceDelimiter(line string) string {
|
||||
trimmed := strings.TrimLeft(line, " \t")
|
||||
if len(trimmed) < 3 {
|
||||
return ""
|
||||
}
|
||||
switch trimmed[0] {
|
||||
case '`', '~':
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
marker := trimmed[0]
|
||||
index := 0
|
||||
for index < len(trimmed) && trimmed[index] == marker {
|
||||
index++
|
||||
}
|
||||
if index < 3 {
|
||||
return ""
|
||||
}
|
||||
return trimmed[:index]
|
||||
}
|
||||
|
||||
func isClosingFenceLine(line, delimiter string) bool {
|
||||
trimmed := strings.TrimLeft(line, " \t")
|
||||
if !strings.HasPrefix(trimmed, delimiter) {
|
||||
return false
|
||||
}
|
||||
return strings.TrimSpace(trimmed[len(delimiter):]) == ""
|
||||
}
|
||||
|
||||
func hasUnexpectedTopLevelProtocolWrapper(source, translated string) bool {
|
||||
sourceTrimmed := strings.ToLower(strings.TrimSpace(source))
|
||||
translatedTrimmed := strings.ToLower(strings.TrimSpace(translated))
|
||||
checks := []struct {
|
||||
token string
|
||||
match func(string) bool
|
||||
}{
|
||||
{token: frontmatterTagStart, match: func(text string) bool { return strings.HasPrefix(text, strings.ToLower(frontmatterTagStart)) }},
|
||||
{token: bodyTagStart, match: func(text string) bool { return strings.HasPrefix(text, strings.ToLower(bodyTagStart)) }},
|
||||
{token: frontmatterTagEnd, match: func(text string) bool { return strings.HasSuffix(text, strings.ToLower(frontmatterTagEnd)) }},
|
||||
{token: bodyTagEnd, match: func(text string) bool { return strings.HasSuffix(text, strings.ToLower(bodyTagEnd)) }},
|
||||
}
|
||||
for _, check := range checks {
|
||||
if check.match(translatedTrimmed) && !check.match(sourceTrimmed) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func containsProtocolWrapperToken(text string) bool {
|
||||
lower := strings.ToLower(text)
|
||||
return strings.Contains(lower, strings.ToLower(bodyTagStart)) || strings.Contains(lower, strings.ToLower(frontmatterTagStart))
|
||||
}
|
||||
|
||||
func translatePlannedDocChunkGroups(ctx context.Context, translator docsTranslator, chunkID string, groups [][]string, srcLang, tgtLang string) (string, error) {
|
||||
var out strings.Builder
|
||||
for index, group := range groups {
|
||||
translated, err := translateDocBlockGroup(ctx, translator, fmt.Sprintf("%s.%02d", chunkID, index+1), group, srcLang, tgtLang)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
out.WriteString(translated)
|
||||
}
|
||||
return out.String(), nil
|
||||
}
|
||||
|
||||
func planDocChunkSplit(blocks []string, maxBytes, promptBudget int) (docChunkSplitPlan, bool) {
|
||||
if len(blocks) == 0 {
|
||||
return docChunkSplitPlan{}, false
|
||||
}
|
||||
source := strings.Join(blocks, "")
|
||||
if strings.TrimSpace(source) == "" {
|
||||
return docChunkSplitPlan{}, false
|
||||
}
|
||||
normalizedSource, _ := stripCommonIndent(source)
|
||||
estimatedPromptCost := estimateDocPromptCost(normalizedSource)
|
||||
if len(blocks) > 1 && promptBudget > 0 && estimatedPromptCost > promptBudget {
|
||||
return splitDocChunkBlocksMidpoint(blocks, estimatedPromptCost, promptBudget)
|
||||
}
|
||||
if len(blocks) == 1 {
|
||||
return planSingletonDocChunk(blocks[0], maxBytes, promptBudget)
|
||||
}
|
||||
return docChunkSplitPlan{}, false
|
||||
}
|
||||
|
||||
func splitDocChunkBlocksMidpoint(blocks []string, estimatedPromptCost, promptBudget int) (docChunkSplitPlan, bool) {
|
||||
if len(blocks) <= 1 {
|
||||
return docChunkSplitPlan{}, false
|
||||
}
|
||||
mid := len(blocks) / 2
|
||||
if mid <= 0 || mid >= len(blocks) {
|
||||
return docChunkSplitPlan{}, false
|
||||
}
|
||||
return docChunkSplitPlan{
|
||||
groups: [][]string{blocks[:mid], blocks[mid:]},
|
||||
reason: fmt.Sprintf("prompt-budget:%d>%d", estimatedPromptCost, promptBudget),
|
||||
}, true
|
||||
}
|
||||
|
||||
func splitDocChunkBlocksMidpointSimple(blocks []string) (docChunkSplitPlan, bool) {
|
||||
if len(blocks) <= 1 {
|
||||
return docChunkSplitPlan{}, false
|
||||
}
|
||||
mid := len(blocks) / 2
|
||||
if mid <= 0 || mid >= len(blocks) {
|
||||
return docChunkSplitPlan{}, false
|
||||
}
|
||||
return docChunkSplitPlan{
|
||||
groups: [][]string{blocks[:mid], blocks[mid:]},
|
||||
reason: "retry-midpoint",
|
||||
}, true
|
||||
}
|
||||
|
||||
func planSingletonDocChunk(block string, maxBytes, promptBudget int) (docChunkSplitPlan, bool) {
|
||||
normalizedBlock, _ := stripCommonIndent(block)
|
||||
estimatedPromptCost := estimateDocPromptCost(normalizedBlock)
|
||||
overBytes := maxBytes > 0 && len(block) > maxBytes
|
||||
overPrompt := promptBudget > 0 && estimatedPromptCost > promptBudget
|
||||
if !overBytes && !overPrompt {
|
||||
return docChunkSplitPlan{}, false
|
||||
}
|
||||
|
||||
return planSingletonDocChunkWithMode(block, maxBytes, promptBudget, false)
|
||||
}
|
||||
|
||||
func planSingletonDocChunkRetry(block string, maxBytes, promptBudget int) (docChunkSplitPlan, bool) {
|
||||
return planSingletonDocChunkWithMode(block, maxBytes, promptBudget, true)
|
||||
}
|
||||
|
||||
func planSingletonDocChunkWithMode(block string, maxBytes, promptBudget int, force bool) (docChunkSplitPlan, bool) {
|
||||
if sections := splitDocBlockSections(block); len(sections) > 1 {
|
||||
if groups := wrapDocChunkSections(sections); len(groups) > 1 {
|
||||
reason := "singleton-structural"
|
||||
if force {
|
||||
reason = "singleton-retry-structural"
|
||||
}
|
||||
return docChunkSplitPlan{
|
||||
groups: groups,
|
||||
reason: reason,
|
||||
}, true
|
||||
}
|
||||
}
|
||||
|
||||
if groups, ok := splitPureFencedDocSectionWithMode(block, maxBytes, promptBudget, force); ok {
|
||||
reason := "singleton-fence"
|
||||
if force {
|
||||
reason = "singleton-retry-fence"
|
||||
}
|
||||
return docChunkSplitPlan{
|
||||
groups: groups,
|
||||
reason: reason,
|
||||
}, true
|
||||
}
|
||||
|
||||
if groups, ok := splitPlainDocSectionWithMode(block, maxBytes, promptBudget, force); ok {
|
||||
reason := "singleton-lines"
|
||||
if force {
|
||||
reason = "singleton-retry-lines"
|
||||
}
|
||||
return docChunkSplitPlan{
|
||||
groups: groups,
|
||||
reason: reason,
|
||||
}, true
|
||||
}
|
||||
|
||||
return docChunkSplitPlan{}, false
|
||||
}
|
||||
|
||||
func wrapDocChunkSections(sections []string) [][]string {
|
||||
groups := make([][]string, 0, len(sections))
|
||||
for _, section := range sections {
|
||||
if strings.TrimSpace(section) == "" {
|
||||
continue
|
||||
}
|
||||
groups = append(groups, []string{section})
|
||||
}
|
||||
return groups
|
||||
}
|
||||
|
||||
func splitDocBlockSections(block string) []string {
|
||||
lines := strings.SplitAfter(block, "\n")
|
||||
if len(lines) == 0 {
|
||||
return nil
|
||||
}
|
||||
sections := make([]string, 0, len(lines))
|
||||
var current strings.Builder
|
||||
fenceDelimiter := ""
|
||||
for _, line := range lines {
|
||||
lineDelimiter := leadingFenceDelimiter(line)
|
||||
if fenceDelimiter == "" && lineDelimiter != "" {
|
||||
if current.Len() > 0 {
|
||||
sections = append(sections, current.String())
|
||||
current.Reset()
|
||||
}
|
||||
current.WriteString(line)
|
||||
fenceDelimiter = lineDelimiter
|
||||
continue
|
||||
}
|
||||
|
||||
current.WriteString(line)
|
||||
if fenceDelimiter != "" {
|
||||
if lineDelimiter != "" && lineDelimiter[0] == fenceDelimiter[0] && len(lineDelimiter) >= len(fenceDelimiter) && isClosingFenceLine(line, fenceDelimiter) {
|
||||
sections = append(sections, current.String())
|
||||
current.Reset()
|
||||
fenceDelimiter = ""
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.TrimSpace(line) == "" {
|
||||
sections = append(sections, current.String())
|
||||
current.Reset()
|
||||
}
|
||||
}
|
||||
if current.Len() > 0 {
|
||||
sections = append(sections, current.String())
|
||||
}
|
||||
if len(sections) <= 1 {
|
||||
return nil
|
||||
}
|
||||
return sections
|
||||
}
|
||||
|
||||
func splitPureFencedDocSection(block string, maxBytes, promptBudget int) ([][]string, bool) {
|
||||
return splitPureFencedDocSectionWithMode(block, maxBytes, promptBudget, false)
|
||||
}
|
||||
|
||||
func splitPureFencedDocSectionWithMode(block string, maxBytes, promptBudget int, force bool) ([][]string, bool) {
|
||||
lines := strings.SplitAfter(block, "\n")
|
||||
if len(lines) < 2 {
|
||||
return nil, false
|
||||
}
|
||||
openingIndex := firstNonEmptyLineIndex(lines)
|
||||
closingIndex := lastNonEmptyLineIndex(lines)
|
||||
if openingIndex == -1 || closingIndex <= openingIndex {
|
||||
return nil, false
|
||||
}
|
||||
opening := lines[openingIndex]
|
||||
delimiter := leadingFenceDelimiter(opening)
|
||||
if delimiter == "" || !isClosingFenceLine(lines[closingIndex], delimiter) {
|
||||
return nil, false
|
||||
}
|
||||
prefix := strings.Join(lines[:openingIndex], "")
|
||||
suffix := strings.Join(lines[closingIndex+1:], "")
|
||||
if strings.TrimSpace(prefix) != "" || strings.TrimSpace(suffix) != "" {
|
||||
return nil, false
|
||||
}
|
||||
closing := lines[closingIndex]
|
||||
inner := strings.Join(lines[openingIndex+1:closingIndex], "")
|
||||
groups, ok := splitPlainDocSectionWithMode(inner, maxBytes-len(opening)-len(closing), promptBudget, force)
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
for index, group := range groups {
|
||||
joined := strings.Join(group, "")
|
||||
groups[index] = []string{opening + joined + closing}
|
||||
}
|
||||
return groups, true
|
||||
}
|
||||
|
||||
func splitPlainDocSection(text string, maxBytes, promptBudget int) ([][]string, bool) {
|
||||
return splitPlainDocSectionWithMode(text, maxBytes, promptBudget, false)
|
||||
}
|
||||
|
||||
func splitPlainDocSectionWithMode(text string, maxBytes, promptBudget int, force bool) ([][]string, bool) {
|
||||
if maxBytes <= 0 {
|
||||
maxBytes = len(text)
|
||||
}
|
||||
if promptBudget <= 0 {
|
||||
promptBudget = defaultDocChunkPromptBudget
|
||||
}
|
||||
lines := strings.SplitAfter(text, "\n")
|
||||
if len(lines) <= 1 {
|
||||
return nil, false
|
||||
}
|
||||
groups := make([][]string, 0, len(lines))
|
||||
var current strings.Builder
|
||||
currentBytes := 0
|
||||
currentPrompt := 0
|
||||
for _, line := range lines {
|
||||
linePrompt := estimateDocPromptCost(line)
|
||||
if len(line) > maxBytes || linePrompt > promptBudget {
|
||||
return nil, false
|
||||
}
|
||||
if currentBytes > 0 && (currentBytes+len(line) > maxBytes || currentPrompt+linePrompt > promptBudget) {
|
||||
groups = append(groups, []string{current.String()})
|
||||
current.Reset()
|
||||
currentBytes = 0
|
||||
currentPrompt = 0
|
||||
}
|
||||
current.WriteString(line)
|
||||
currentBytes += len(line)
|
||||
currentPrompt += linePrompt
|
||||
}
|
||||
if current.Len() > 0 {
|
||||
groups = append(groups, []string{current.String()})
|
||||
}
|
||||
if len(groups) <= 1 {
|
||||
if !force {
|
||||
return nil, false
|
||||
}
|
||||
return splitPlainDocSectionMidpoint(lines)
|
||||
}
|
||||
return groups, true
|
||||
}
|
||||
|
||||
func splitPlainDocSectionMidpoint(lines []string) ([][]string, bool) {
|
||||
if len(lines) <= 1 {
|
||||
return nil, false
|
||||
}
|
||||
mid := len(lines) / 2
|
||||
if mid <= 0 || mid >= len(lines) {
|
||||
return nil, false
|
||||
}
|
||||
left := strings.Join(lines[:mid], "")
|
||||
right := strings.Join(lines[mid:], "")
|
||||
if strings.TrimSpace(left) == "" || strings.TrimSpace(right) == "" {
|
||||
return nil, false
|
||||
}
|
||||
return [][]string{{left}, {right}}, true
|
||||
}
|
||||
|
||||
func firstNonEmptyLineIndex(lines []string) int {
|
||||
for index, line := range lines {
|
||||
if strings.TrimSpace(line) != "" {
|
||||
return index
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func lastNonEmptyLineIndex(lines []string) int {
|
||||
for index := len(lines) - 1; index >= 0; index-- {
|
||||
if strings.TrimSpace(lines[index]) != "" {
|
||||
return index
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func docsI18nDocChunkMaxBytes() int {
|
||||
value := strings.TrimSpace(os.Getenv("OPENCLAW_DOCS_I18N_DOC_CHUNK_MAX_BYTES"))
|
||||
if value == "" {
|
||||
return defaultDocChunkMaxBytes
|
||||
}
|
||||
parsed, err := strconv.Atoi(value)
|
||||
if err != nil || parsed <= 0 {
|
||||
return defaultDocChunkMaxBytes
|
||||
}
|
||||
return parsed
|
||||
}
|
||||
|
||||
func docsI18nDocChunkPromptBudget() int {
|
||||
value := strings.TrimSpace(os.Getenv("OPENCLAW_DOCS_I18N_DOC_CHUNK_PROMPT_BUDGET"))
|
||||
if value == "" {
|
||||
return defaultDocChunkPromptBudget
|
||||
}
|
||||
parsed, err := strconv.Atoi(value)
|
||||
if err != nil || parsed <= 0 {
|
||||
return defaultDocChunkPromptBudget
|
||||
}
|
||||
return parsed
|
||||
}
|
||||
|
||||
func estimateDocPromptCost(text string) int {
|
||||
cost := len(text)
|
||||
cost += strings.Count(text, "`") * 6
|
||||
cost += strings.Count(text, "|") * 4
|
||||
cost += strings.Count(text, "{") * 4
|
||||
cost += strings.Count(text, "}") * 4
|
||||
cost += strings.Count(text, "[") * 4
|
||||
cost += strings.Count(text, "]") * 4
|
||||
cost += strings.Count(text, ":") * 2
|
||||
cost += strings.Count(text, "<") * 4
|
||||
cost += strings.Count(text, ">") * 4
|
||||
return cost
|
||||
}
|
||||
|
||||
func stripCommonIndent(text string) (string, string) {
|
||||
lines := strings.SplitAfter(text, "\n")
|
||||
common := ""
|
||||
for _, line := range lines {
|
||||
trimmed := strings.TrimRight(line, "\r\n")
|
||||
if strings.TrimSpace(trimmed) == "" {
|
||||
continue
|
||||
}
|
||||
indent := leadingIndent(trimmed)
|
||||
if common == "" {
|
||||
common = indent
|
||||
continue
|
||||
}
|
||||
common = commonIndentPrefix(common, indent)
|
||||
if common == "" {
|
||||
return text, ""
|
||||
}
|
||||
}
|
||||
if common == "" {
|
||||
return text, ""
|
||||
}
|
||||
var out strings.Builder
|
||||
for _, line := range lines {
|
||||
trimmed := strings.TrimRight(line, "\r\n")
|
||||
if strings.TrimSpace(trimmed) == "" {
|
||||
out.WriteString(line)
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(line, common) {
|
||||
out.WriteString(strings.TrimPrefix(line, common))
|
||||
continue
|
||||
}
|
||||
out.WriteString(line)
|
||||
}
|
||||
return out.String(), common
|
||||
}
|
||||
|
||||
func reapplyCommonIndent(text, indent string) string {
|
||||
if indent == "" || text == "" {
|
||||
return text
|
||||
}
|
||||
lines := strings.SplitAfter(text, "\n")
|
||||
var out strings.Builder
|
||||
for _, line := range lines {
|
||||
trimmed := strings.TrimRight(line, "\r\n")
|
||||
if strings.TrimSpace(trimmed) == "" {
|
||||
out.WriteString(line)
|
||||
continue
|
||||
}
|
||||
out.WriteString(indent)
|
||||
out.WriteString(line)
|
||||
}
|
||||
return out.String()
|
||||
}
|
||||
|
||||
func leadingIndent(line string) string {
|
||||
index := 0
|
||||
for index < len(line) {
|
||||
if line[index] != ' ' && line[index] != '\t' {
|
||||
break
|
||||
}
|
||||
index++
|
||||
}
|
||||
return line[:index]
|
||||
}
|
||||
|
||||
func commonIndentPrefix(a, b string) string {
|
||||
limit := len(a)
|
||||
if len(b) < limit {
|
||||
limit = len(b)
|
||||
}
|
||||
index := 0
|
||||
for index < limit && a[index] == b[index] {
|
||||
index++
|
||||
}
|
||||
return a[:index]
|
||||
}
|
||||
Reference in New Issue
Block a user