docs: harden zh-CN translation flow

2026-04-11 01:01:13 +00:00 · 2026-04-05 11:02:32 +01:00
parent 3b84884793
commit f2dc241e9d
3 changed files with 86 additions and 13 deletions
--- a/docs/.i18n/README.md
+++ b/docs/.i18n/README.md
@@ -2,12 +2,38 @@

 This folder stores translation config for the source docs repo.

-Generated zh-CN pages and the live zh-CN translation memory now live in the publish repo (`openclaw/docs`, local sibling checkout `~/Projects/openclaw-docs`).
+Generated zh-CN pages and the live zh-CN translation memory are kept in the publish repo:

-## Files
+- repo: `openclaw/docs`
+- local checkout: `~/Projects/openclaw-docs`

- `glossary.<lang>.json` — preferred term mappings (used in prompt guidance).
- `<lang>.tm.jsonl` — translation memory (cache) keyed by workflow + model + text hash. In this repo, zh-CN TM is no longer committed.
+## Source of truth
+
+- English docs are authored in `openclaw/openclaw`.
+- The source docs tree lives under `docs/`.
+- The source repo no longer keeps committed `docs/zh-CN/**`.
+
+## End-to-end flow
+
+1. Edit English docs in `openclaw/openclaw`.
+2. Push to `main`.
+3. `openclaw/openclaw/.github/workflows/docs-sync-publish.yml` mirrors the docs tree into `openclaw/docs`.
+4. The sync script rewrites the publish repo's `docs/docs.json` so that the `zh-Hans` navigation exists there even though the source repo's `docs/docs.json` no longer includes it.
+5. `openclaw/docs/.github/workflows/translate-zh-cn.yml` refreshes `docs/zh-CN/**` on push and hourly.
+
+## Why the split exists
+
+- Keep generated zh-CN output out of the main product repo.
+- Keep Mintlify on a single published docs tree.
+- Preserve the built-in language switcher by letting the publish repo own `docs/zh-CN/**`.
+
+## Files in this folder
+
+- `glossary.<lang>.json` — preferred term mappings used as prompt guidance.
+- `zh-Hans-navigation.json` — the `zh-Hans` Mintlify nav block reinserted into the publish repo during sync.
+- `<lang>.tm.jsonl` — translation memory keyed by workflow + model + text hash.
+
+In this repo, `docs/.i18n/zh-CN.tm.jsonl` is intentionally no longer committed.

 ## Glossary format

@@ -27,8 +53,18 @@ Fields:
 - `source`: English (or source) phrase to prefer.
 - `target`: preferred translation output.

-## Notes
+## Translation mechanics

- Glossary entries are passed to the model as **prompt guidance** (no deterministic rewrites).
 - `scripts/docs-i18n` still owns translation generation.
- The source repo syncs English docs into the publish repo; zh-CN generation runs there on push and hourly.
+- Doc mode writes `x-i18n.source_hash` into each translated page.
+- The publish workflow precomputes a pending file list by comparing the current English source hash to the stored zh-CN `x-i18n.source_hash`.
+- If the pending count is `0`, the expensive translation step is skipped entirely.
+- If there are pending files, the workflow translates only those files.
+- The publish workflow retries transient model-format failures, but unchanged files stay skipped because the same hash check runs on each retry.
+
+## Operational notes
+
+- Sync metadata is written to `.openclaw-sync/source.json` in the publish repo.
+- Source repo secret: `OPENCLAW_DOCS_SYNC_TOKEN`
+- Publish repo secret: `OPENCLAW_DOCS_I18N_OPENAI_API_KEY`
+- If zh-CN output looks stale, check the `Translate zh-CN` workflow in `openclaw/docs` first.
--- a/scripts/docs-i18n/doc_mode.go
+++ b/scripts/docs-i18n/doc_mode.go
@@ -101,20 +101,26 @@ func parseTaggedDocument(text string) (string, string, error) {
 		return "", "", fmt.Errorf("missing %s", bodyTagStart)
 	}
 	bodyStart += frontEnd + len(bodyTagStart)
-	bodyEnd := strings.Index(text[bodyStart:], bodyTagEnd)
-	if bodyEnd == -1 {
-		return "", "", fmt.Errorf("missing %s", bodyTagEnd)
+
+	body := ""
+	suffix := ""
+	if bodyEnd := strings.Index(text[bodyStart:], bodyTagEnd); bodyEnd != -1 {
+		bodyEnd += bodyStart
+		body = trimTagNewlines(text[bodyStart:bodyEnd])
+		suffix = strings.TrimSpace(text[bodyEnd+len(bodyTagEnd):])
+	} else {
+		// Some model replies omit the final closing tag but otherwise return a
+		// valid document. Treat EOF as the end of <body> so doc retries do not
+		// burn through the whole workflow on a recoverable formatting slip.
+		body = trimTagNewlines(text[bodyStart:])
 	}
-	bodyEnd += bodyStart

 	prefix := strings.TrimSpace(text[:frontStart-len(frontmatterTagStart)])
-	suffix := strings.TrimSpace(text[bodyEnd+len(bodyTagEnd):])
 	if prefix != "" || suffix != "" {
 		return "", "", fmt.Errorf("unexpected text outside tagged sections")
 	}

 	frontMatter := trimTagNewlines(text[frontStart:frontEnd])
-	body := trimTagNewlines(text[bodyStart:bodyEnd])
 	return frontMatter, body, nil
 }

--- a/scripts/docs-i18n/doc_mode_test.go
+++ b/scripts/docs-i18n/doc_mode_test.go
@@ -0,0 +1,31 @@
+package main
+
+import "testing"
+
+func TestParseTaggedDocumentAcceptsMissingBodyCloseAtEOF(t *testing.T) {
+	t.Parallel()
+
+	input := "<frontmatter>\ntitle: Test\n</frontmatter>\n<body>\nTranslated body\n"
+
+	front, body, err := parseTaggedDocument(input)
+	if err != nil {
+		t.Fatalf("parseTaggedDocument returned error: %v", err)
+	}
+	if front != "title: Test" {
+		t.Fatalf("unexpected frontmatter %q", front)
+	}
+	if body != "Translated body" {
+		t.Fatalf("unexpected body %q", body)
+	}
+}
+
+func TestParseTaggedDocumentRejectsTrailingTextOutsideTags(t *testing.T) {
+	t.Parallel()
+
+	input := "<frontmatter>\ntitle: Test\n</frontmatter>\n<body>\nTranslated body\n</body>\nextra"
+
+	_, _, err := parseTaggedDocument(input)
+	if err == nil {
+		t.Fatal("expected error for trailing text")
+	}
+}