加拼音(新格式)-代码

2020-07-16  本文已影响0人  姜附

word文档全篇加拼音(批量注音)+批量修改读音

加拼音(新格式)-代码

package main

import (
    "archive/zip"
    "runtime"

    "bufio"
    "bytes"
    "errors"
    "fmt"
    "io"
    "io/ioutil"
    "os"
    "path"
    _ "regexp"
    "strconv"
    "strings"
    "time"
    "unicode"

    "golang.org/x/text/encoding/simplifiedchinese"
    "golang.org/x/text/transform"

    // "github.com/aurelien-rainone/assertgo"
    "github.com/etree"
    "github.com/mozillazg/go-pinyin"
)

// assert panics when cond is false. The panic value is a string made of the
// caller's source file, line number and function name, followed by two
// spaces and failPrompt. It does nothing when cond is true.
func assert(cond bool, failPrompt string) {
    if cond {
        return
    }

    // Skip one frame so the location reported is that of assert's caller.
    pc, file, line, _ := runtime.Caller(1)
    location := file + ":" + strconv.Itoa(line) + ":" + runtime.FuncForPC(pc).Name()
    panic(location + "  " + failPrompt)
}

// Font name written into the w:rFonts attributes of the pinyin runs.
var g_configPinyinFont string = "微软雅黑"
// Pinyin font size from the config file; -1 means "not configured".
var g_configPinyinFontSize int = -1 // the value actually used is this literal value * 2
// var g_configPinyinUseFontSize bool = false // whether to use one uniform pinyin font size
// Maps the alignment names accepted by the "拼音对齐" config key to the
// values stored in g_configPinyinAlignment.
var g_configPinyinAlignmentMap map[string]string = map[string]string{
    "居中":    "center",
    "0-1-0": "distributeLetter",
    "1-2-1": "distributeSpace",
    "左对齐":   "left",
    "右对齐":   "right",
}

var g_configPinyinAlignment = "center" // alignment; centered by default

var g_configPinyinRaise int = -1 // pinyin offset; -1 means "not configured"

// parseConfigLine applies a single "key=value" line of the config file to the
// package-level g_configPinyin* settings.
//
// Recognised keys are "拼音字体" (font), "拼音字号" (font size, integer >= 1),
// "拼音对齐" (alignment, a key of g_configPinyinAlignmentMap) and "拼音偏移"
// (offset, integer >= 0). An empty value leaves the corresponding setting at
// its current value for every key; this matters because the default config
// file written by this program contains all four keys with empty values.
//
// A malformed line, an unknown key or an invalid value makes assert panic
// with "无效配置:" followed by the line. The returned error is always nil.
func parseConfigLine(line string) error {

    line2 := strings.TrimSpace(line)
    vals := strings.Split(line2, "=")
    assert(2 == len(vals), "无效配置:"+line)

    key, value := vals[0], vals[1]

    switch key {
    case "拼音字体":
        if "" != value {
            g_configPinyinFont = value
        }
    case "拼音字号":
        if "" != value {
            v, err := strconv.Atoi(value)
            assert((nil == err) && (1 <= v), "无效配置:"+line)
            g_configPinyinFontSize = v
        }
    case "拼音对齐":
        if "" != value {
            // A missing map entry yields "", which is rejected below.
            v := g_configPinyinAlignmentMap[value]
            assert("" != v, "无效配置:"+line)
            g_configPinyinAlignment = v
        }
    case "拼音偏移":
        // An empty offset means "not configured", exactly like the other
        // keys; without this guard the generated default file would panic.
        if "" != value {
            v, err := strconv.Atoi(value)
            assert((nil == err) && (0 <= v), "无效配置:"+line)
            g_configPinyinRaise = v
        }
    default:
        assert(false, "无效配置:"+line)
    }

    return nil
}

func parseConfig() error {
    fileName := "拼音配置.txt"
    fileinfo, err := os.Stat(fileName)
    if nil != err {
        // 文件不存在,创建一个新的,并写入注释说明
        newFile, err := os.Create(fileName)
        if err != nil {
            return err
        }
        defer newFile.Close()
        newFile.WriteString(
            fmt.Sprintf(
                // 默认不填写具体值
                `# 说明:只支持docx格式的word文档;
# 拼音字号默认动态变化,指定字号后全篇拼音使用指定字号;
拼音字体=
拼音字号=
拼音对齐=
拼音偏移=
`,
                // g_configPinyinFont,
                // g_configPinyinFontSize/2,
                // g_configPinyinOffset,
            ),
            // fmt.Sprintf("# 说明:只支持docx格式的word文档\r\n拼音字体=%s",
            //  g_configPinyinFont,
            // ),
        )
    } else if fileinfo.IsDir() {
        return errors.New("无法创建配置文件:" + fileName)
    } else {

        f, err := os.Open(fileName)
        if err != nil {
            return nil
        }

        br := bufio.NewReader(f)
        for {
            line, _, err := br.ReadLine()
            if err == io.EOF {
                break
            }

            // line = strings.TrimSpace(line)
            strLine := strings.ReplaceAll(string(line), " ", "") // 去掉所有空格

            if "" == strLine {
                continue
            }

            // strLine := strings.TrimLeft(string(line), " ")
            if strings.HasPrefix(strLine, "#") {
                continue
            }

            if err = parseConfigLine(strLine); nil != err {
                return err
            }
        }

    }
    return nil
}

// func createWrPrNode(oldWrPrNode *etree.Element) *etree.Element {
//  newWrPr := oldWrPrNode.Copy()

//  wrFonts := newWrPr.FindElement("w:rFonts")
//  if nil == wrFonts {
//      // panic(fileLine() + "no w:rFonts")
//      // wrFonts = newWrPr.CreateElement("w:rFonts")
//  } else {
//      whint := wrFonts.SelectAttr("w:hint")
//      if nil != whint {
//          whint.Value = "default"
//      } else {
//          wrFonts.CreateAttr("w:hint", "default")
//      }
//  }

//  return newWrPr
// }

// func createWrBegin(wrPrNode *etree.Element) *etree.Element {

//  wr := etree.NewElement("w:r")
//  if nil != wrPrNode {
//      wr.AddChild(createWrPrNode(wrPrNode))
//  }

//  wfldChar := etree.NewElement("w:fldChar")
//  wfldChar.CreateAttr("w:fldCharType", "begin")
//  wr.AddChild(wfldChar)

//  return wr
// }

// func createWrinstrText(text string, wrPrNode *etree.Element) *etree.Element {
//  if nil == wrPrNode {
//      // panic(fileLine() + "nil == wrPrNode")
//  }

//  wr := etree.NewElement("w:r")

//  if nil != wrPrNode {
//      wr.AddChild(createWrPrNode(wrPrNode))
//  }

//  winstrText := etree.NewElement("w:instrText")
//  winstrText.CreateAttr("xml:space", "preserve")
//  winstrText.SetText(text)
//  // fmt.Println(wfldChar.Text())
//  wr.AddChild(winstrText)

//  return wr
// }

// func createWrEnd(wrPrNode *etree.Element) *etree.Element {
//  wr := etree.NewElement("w:r")

//  if nil != wrPrNode {
//      wr.AddChild(createWrPrNode(wrPrNode))
//  }

//  wfldChar := etree.NewElement("w:fldChar")
//  wfldChar.CreateAttr("w:fldCharType", "end")
//  wr.AddChild(wfldChar)

//  return wr
// }

// create__w_r__w_t returns a copy of oldWrNode whose "w:t" element holds the
// text w. When the copy has no "w:t" element, a new one is appended to it.
// oldWrNode itself is not modified.
func create__w_r__w_t(w string, oldWrNode *etree.Element) *etree.Element {
    run := oldWrNode.Copy()

    if textNode := run.FindElement("w:t"); nil != textNode {
        textNode.SetText(w)
        return run
    }

    textNode := etree.NewElement("w:t")
    run.AddChild(textNode)
    textNode.SetText(w)
    return run
}

// procYI returns the pinyin of the character "一" according to what follows.
//
// Rules (tones 1-4 are yin-ping, yang-ping, shang and qu respectively):
//   - used alone or at the end of a word or sentence: first tone (yī);
//   - before a fourth-tone syllable: second tone (yí);
//   - before a first-, second- or third-tone syllable: fourth tone (yì).
//
// nextHan is the character following "一" ("" when there is none) and
// nextPinyin is that character's tone-marked pinyin. When nextHan starts
// with a Han character, nextPinyin must be non-empty, otherwise assert
// panics.
func procYI(nextHan string, nextPinyin string) string {

    // Nothing follows: treated as the end of the sentence.
    if "" == nextHan {
        return "yī"
    }
    // Followed by something that is not a Han character.
    if first := []rune(nextHan)[0]; !unicode.Is(unicode.Han, first) {
        return "yī"
    }

    assert("" != nextPinyin, "nextPinyin为空")

    // Followed by a Han character: a fourth-tone vowel selects the second tone.
    if !strings.ContainsAny(nextPinyin, "àòèìùǜ") {
        return "yì"
    }
    return "yí"
}

// procBU returns the pinyin of the character "不" according to what follows.
//
// Before a fourth-tone (qu) syllable "不" changes to the second tone (bú);
// in every other case it keeps the fourth tone (bù). Tones 1-4 are yin-ping,
// yang-ping, shang and qu respectively.
//
// nextHan is the character following "不" ("" when there is none) and
// nextPinyin is that character's tone-marked pinyin ("" when unknown).
func procBU(nextHan string, nextPinyin string) string {

    // A fourth-tone vowel in nextPinyin already implies nextPinyin is
    // non-empty, so no separate emptiness check is needed here.
    if ("" != nextHan) && strings.ContainsAny(nextPinyin, "àòèìùǜ") {
        return "bú"
    }

    return "bù"
}

// createNodeAttr builds a detached element named tag that carries a single
// attribute attrName=attrValue.
func createNodeAttr(tag, attrName, attrValue string) *etree.Element {
    node := etree.NewElement(tag)
    node.CreateAttr(attrName, attrValue)

    return node
}

// create__w_rubyPr builds the "w:rubyPr" element of a ruby annotation.
//
// fontSize is the base text size and pinyinFontSize the pinyin size, both
// written as-is into w:hpsBaseText and w:hps. The raise written to
// w:hpsRaise is fontSize-2, plus twice g_configPinyinRaise when that setting
// is configured (not -1). Alignment comes from g_configPinyinAlignment and
// the language id is fixed to "zh-CN".
func create__w_rubyPr(fontSize int, pinyinFontSize int) *etree.Element {

    pinyinRaise := fontSize - 2
    if -1 != g_configPinyinRaise {
        // Original author's note: when an offset is configured an extra 2
        // seemed to need subtracting, for reasons unknown; that subtraction
        // is not applied.
        pinyinRaise += g_configPinyinRaise * 2
    }

    // Children are appended in exactly this order.
    settings := [][2]string{
        {"w:rubyAlign", g_configPinyinAlignment},
        {"w:hps", strconv.Itoa(pinyinFontSize)},
        {"w:hpsRaise", strconv.Itoa(pinyinRaise)},
        {"w:hpsBaseText", strconv.Itoa(fontSize)},
        {"w:lid", "zh-CN"},
    }

    w_rubyPr := etree.NewElement("w:rubyPr")
    for _, s := range settings {
        w_rubyPr.AddChild(createNodeAttr(s[0], "w:val", s[1]))
    }

    return w_rubyPr
}

// set_attr sets attribute attr_name of node to value, overwriting the value
// of an existing attribute or creating the attribute when it is absent.
func set_attr(node *etree.Element, attr_name string, value string) {
    existing := node.SelectAttr(attr_name)
    if nil == existing {
        node.CreateAttr(attr_name, value)
        return
    }
    existing.Value = value
}

// create__w_rt builds the "w:rt" element (the pinyin part of a ruby
// annotation): a w:rt containing one w:r with run properties and a w:t
// holding pinyin.
//
// The run properties are a copy of w_rPr, or a fresh empty w:rPr when w_rPr
// is nil. On that copy the w:rFonts attributes w:ascii, w:eastAsia and
// w:hAnsi are set to g_configPinyinFont, and w:sz is set to pinyinFontSize;
// both elements are created when missing. w:sz, w:szCs and w:shd are then
// moved, in that order, to the end of the properties. fontSize is not used.
func create__w_rt(w_rPr *etree.Element, fontSize int,
    pinyinFontSize int, pinyin string) *etree.Element {
    rubyText := etree.NewElement("w:rt")
    run := rubyText.CreateElement("w:r")

    var runProps *etree.Element
    if nil != w_rPr {
        runProps = w_rPr.Copy()
        run.AddChild(runProps)
    } else {
        runProps = run.CreateElement("w:rPr")
    }

    fonts := runProps.FindElement("w:rFonts")
    if nil == fonts {
        fonts = runProps.CreateElement("w:rFonts")
    }
    for _, attrName := range []string{"w:ascii", "w:eastAsia", "w:hAnsi"} {
        set_attr(fonts, attrName, g_configPinyinFont)
    }

    size := runProps.FindElement("w:sz")
    if nil == size {
        size = runProps.CreateElement("w:sz")
    }
    set_attr(size, "w:val", strconv.Itoa(pinyinFontSize))

    // Re-append to fix the child order: w:sz, then w:szCs, then w:shd.
    runProps.RemoveChild(size)
    runProps.AddChild(size)
    remove_then_add_child(runProps, "w:szCs")
    remove_then_add_child(runProps, "w:shd")

    run.CreateElement("w:t").SetText(pinyin)

    return rubyText
}

// create__w_rubyBase builds the "w:rubyBase" element (the base-text part of
// a ruby annotation): a w:rubyBase containing one w:r whose w:t holds han.
// When w_rPr is not nil, a copy of it is placed in the run before the w:t.
func create__w_rubyBase(w_rPr *etree.Element, han string) *etree.Element {
    base := etree.NewElement("w:rubyBase")
    run := base.CreateElement("w:r")

    if nil != w_rPr {
        run.AddChild(w_rPr.Copy())
    }
    run.CreateElement("w:t").SetText(han)

    return base
}

// create__w_ruby builds a "w:ruby" element annotating han with pinyin. Its
// children are, in order, the ruby properties, the pinyin (w:rt) and the
// base text (w:rubyBase).
//
// The pinyin font size is g_configPinyinFontSize*2 when that setting is
// configured (not -1), otherwise half of fontSize.
func create__w_ruby(w_rPr *etree.Element, fontSize int,
    pinyin string, han string) *etree.Element {

    pinyinFontSize := fontSize / 2
    if -1 != g_configPinyinFontSize {
        pinyinFontSize = g_configPinyinFontSize * 2
    }

    parts := []*etree.Element{
        create__w_rubyPr(fontSize, pinyinFontSize),
        create__w_rt(w_rPr, fontSize, pinyinFontSize, pinyin), // pinyin
        create__w_rubyBase(w_rPr, han),                        // Han character
    }

    w_ruby := etree.NewElement("w:ruby")
    for _, part := range parts {
        w_ruby.AddChild(part)
    }

    return w_ruby
}

// create__w_r builds a "w:r" run that shows han with pinyin above it. The
// run holds a copy of w_rPr (when w_rPr is not nil) followed by the w:ruby
// element produced by create__w_ruby.
func create__w_r(w_rPr *etree.Element, fontSize int,
    pinyin string, han string) *etree.Element {

    run := etree.NewElement("w:r")

    if nil != w_rPr {
        run.AddChild(w_rPr.Copy())
    }

    ruby := create__w_ruby(w_rPr, fontSize, pinyin, han)
    run.AddChild(ruby)

    return run
}

// remove_then_add_attr removes the attribute called name from node and
// creates it again with the same value, which is used to adjust the
// attribute order. It does nothing when node has no such attribute.
func remove_then_add_attr(node *etree.Element, name string) {
    removed := node.RemoveAttr(name)
    if nil == removed {
        return
    }
    node.CreateAttr(name, removed.Value)
}

// remove_child removes from node the first element matched by the path
// child_name. It does nothing when there is no match.
func remove_child(node *etree.Element, child_name string) {
    target := node.FindElement(child_name)
    if nil == target {
        return
    }
    node.RemoveChild(target)
}

// remove_then_add_child removes from node the first element matched by the
// path child_name and adds it back with AddChild, which is used to adjust
// the child order. It does nothing when there is no match.
func remove_then_add_child(node *etree.Element, child_name string) {
    target := node.FindElement(child_name)
    if nil == target {
        return
    }
    node.RemoveChild(target)
    node.AddChild(target)
}

// addPinyin parses buf as the XML of a docx "word/document.xml", annotates
// every Han character with its tone-marked pinyin and returns the
// re-serialized document.
//
// Parsing: buf is parsed as-is first. If that fails it is re-decoded as GBK
// and then as HZ-GB2312 and parsed again; when every attempt fails the last
// error is returned.
//
// Processing, per "w:body/w:p" paragraph:
//   - the paragraph-level run properties get their w:rFonts attributes
//     reordered, w:hint removed and the w:lang child removed;
//   - the text of the first w:t of every direct w:r child is concatenated
//     and converted to pinyin in one go;
//   - every run that has a w:t is replaced by a sequence of new runs
//     inserted before it: one ruby run per Han character, and one plain run
//     (a copy of the original) per stretch of other characters.
//
// The readings of "一" and "不" are chosen by procYI and procBU from the
// character that follows them within the paragraph.
//
// An error is returned when the document has no w:document root or when
// fewer pinyin readings than Han characters are available. An internal
// mismatch between the per-run and per-paragraph text makes assert panic.
func addPinyin(buf []byte) (string, error) {

    pinyinArg := pinyin.NewArgs()
    pinyinArg.Style = pinyin.Tone // include tone marks

    doc := etree.NewDocument()
    err := doc.ReadFromBytes(buf)
    if nil != err {
        fmt.Println(err)

        transformers := []transform.Transformer{
            simplifiedchinese.GBK.NewDecoder(),
            simplifiedchinese.HZGB2312.NewDecoder(),
        }

        fmt.Println("尝试转码")
        for _, t := range transformers {

            I := bytes.NewReader(buf)
            O := transform.NewReader(I, t)
            var d []byte
            d, err = ioutil.ReadAll(O)
            if nil != err {
                continue
            }

            err = doc.ReadFromBytes(d)
            if nil == err {
                fmt.Println("转码成功")
                break
            }
        }

        if nil != err {
            fmt.Println("转码失败")
            return "", err
        }
    }

    wdocument := doc.SelectElement("w:document")
    if nil == wdocument {
        // Not a WordprocessingML main document; avoid a nil dereference.
        return "", errors.New("没有 w:document 节点")
    }

    // Each w:p is one paragraph; paragraphs are processed one at a time.
    for _, w_p := range wdocument.FindElements("w:body/w:p") {

        if w_p__w_pPr__w_rPr := w_p.FindElement("w:pPr/w:rPr"); nil != w_p__w_pPr__w_rPr {
            if w_rFonts := w_p__w_pPr__w_rPr.FindElement("w:rFonts"); nil != w_rFonts {
                // Put the attributes into a fixed order.
                remove_then_add_attr(w_rFonts, "w:ascii")
                remove_then_add_attr(w_rFonts, "w:eastAsia")
                remove_then_add_attr(w_rFonts, "w:hAnsi")
                remove_then_add_attr(w_rFonts, "w:cs")
                w_rFonts.RemoveAttr("w:hint")
            }

            remove_child(w_p__w_pPr__w_rPr, "w:lang")
        }

        // Snapshot of the runs; the paragraph is modified while iterating.
        runs := w_p.FindElements("w:r")

        // All text of the paragraph, so that the reading of a character can
        // depend on the next one even across run boundaries.
        var paragraphText strings.Builder
        for _, wr := range runs {
            if wt := wr.FindElement("w:t"); nil != wt {
                paragraphText.WriteString(wt.Text())
            }
        }
        allStrOfWp := paragraphText.String()

        pinyins := pinyin.Pinyin(allStrOfWp, pinyinArg)
        pinyinIndex := 0
        allHansArr := strings.Split(allStrOfWp, "")
        hanIndex := 0

        for _, w_r := range runs {

            // Default: Word shows 10.5 as its default size, and the number
            // stored in the XML is twice the size shown in Word.
            fontSize := 21

            w_r__w_rPr := w_r.FindElement("w:rPr")
            if nil != w_r__w_rPr {

                if w_rFonts := w_r__w_rPr.FindElement("w:rFonts"); nil != w_rFonts {
                    w_rFonts.RemoveAttr("w:hint")
                    if 0 == len(w_rFonts.Attr) {
                        w_r__w_rPr.RemoveChild(w_rFonts)
                    } else {
                        // Put the attributes into a fixed order.
                        remove_then_add_attr(w_rFonts, "w:ascii")
                        remove_then_add_attr(w_rFonts, "w:eastAsia")
                        remove_then_add_attr(w_rFonts, "w:hAnsi")
                        remove_then_add_attr(w_rFonts, "w:cs")
                    }
                }

                if w_sz := w_r__w_rPr.FindElement("w:sz"); nil != w_sz {
                    if w_val := w_sz.SelectAttr("w:val"); nil != w_val {
                        if v, err := strconv.Atoi(w_val.Value); nil == err {
                            fontSize = v
                        }
                    }
                }

                // Drop run properties that ended up empty.
                if 0 == len(w_r__w_rPr.Child) {
                    w_r__w_rPr.Parent().RemoveChild(w_r__w_rPr)
                    w_r__w_rPr = nil
                }
            }

            w_t := w_r.FindElement("w:t")
            if nil == w_t {
                continue
            }

            text := w_t.Text()

            // Pending stretch of non-Han characters not yet emitted.
            lastStr := ""

            for _, w := range text {
                han := string(w)
                assert(han == allHansArr[hanIndex], "出错啦")

                if unicode.Is(unicode.Han, w) {
                    if "" != lastStr {
                        w_p.InsertChild(w_r, create__w_r__w_t(lastStr, w_r))
                        lastStr = ""
                    }

                    if pinyinIndex >= len(pinyins) {
                        return "", errors.New("拼音数量与汉字数量不一致:" + han)
                    }

                    nextHan := ""
                    nextPinyin := ""
                    if hanIndex+1 < len(allHansArr) {
                        nextHan = allHansArr[hanIndex+1]
                        if unicode.Is(unicode.Han, []rune(nextHan)[0]) {
                            if pinyinIndex+1 >= len(pinyins) {
                                return "", errors.New("拼音数量与汉字数量不一致:" + nextHan)
                            }
                            nextPinyin = pinyins[pinyinIndex+1][0]
                        }
                    }

                    var reading string
                    switch han {
                    case "一":
                        reading = procYI(nextHan, nextPinyin)
                    case "不":
                        reading = procBU(nextHan, nextPinyin)
                    default:
                        reading = pinyins[pinyinIndex][0]
                    }

                    w_r.Parent().InsertChild(w_r, create__w_r(w_r__w_rPr, fontSize, reading, han))

                    pinyinIndex++

                } else {
                    lastStr += han
                }

                hanIndex++
            }

            if "" != lastStr {
                w_p.InsertChild(w_r, create__w_r__w_t(lastStr, w_r))
            }

            // The original run has been fully replaced by the inserted ones.
            w_r.Parent().RemoveChild(w_r)
        }
    }

    return doc.WriteToString()
}

func procOneDocxFile(fromPath string, toPath string) error {
    zipReader, err := zip.OpenReader(fromPath)
    if err != nil {
        fmt.Print(err)
        return err
    }
    defer zipReader.Close()

    newZipFile, err := os.Create(toPath)
    if err != nil {
        fmt.Println(err)
        return err
    }
    defer newZipFile.Close()

    zipWriter := zip.NewWriter(newZipFile)
    defer zipWriter.Close()

    var f *zip.File
    for _, file := range zipReader.File {

        rc, err := file.Open()
        if nil != err {
            return err
        }

        buf := make([]byte, file.UncompressedSize)

        // zipfile文件一次可能不能读完,循环读完为止
        readLen := 0
        for file.UncompressedSize != uint32(readLen) {
            n, err := rc.Read(buf[readLen:])
            if nil != err && (0 != strings.Compare("EOF", err.Error())) {
                fmt.Println(err)
                return err
            }
            if 0 == n {
                return errors.New("读取zip出错")
            }
            readLen += n
        }

        var newBuf []byte
        if "word/document.xml" == file.Name {
            f = file

            assert(file.UncompressedSize == uint32(readLen), "读取错误")

            newXmlStr, err := addPinyin(buf)
            if nil != err {
                return err
            }

            newBuf = []byte(newXmlStr)

        } else {
            newBuf = buf

        }

        newFile, err := zipWriter.Create(file.Name)
        if err != nil {
            return err
        }

        _, err = newFile.Write(newBuf)
        if err != nil {
            return err
        }
    }

    if nil == f {
        err = errors.New(fromPath + ": 没有 word/document.xml")
        return err
    }

    return nil
}

// initDir makes sure every entry of paths is a directory, in order.
//
// An entry that cannot be stat'ed is created with os.Mkdir (parents are not
// created); a creation failure is printed and returned. An entry that exists
// but is not a directory yields an error. Processing stops at the first
// failure. Returns nil when all entries are directories afterwards.
func initDir(paths []string) error {

    for _, dir := range paths {
        info, statErr := os.Stat(dir)

        switch {
        case nil != statErr:
            // Not accessible: try to create it.
            if mkErr := os.Mkdir(dir, os.ModePerm); nil != mkErr {
                fmt.Println(mkErr)
                return mkErr
            }
        case !info.IsDir():
            return errors.New("无法创建目录:" + dir)
        }
    }

    return nil
}

// main adds pinyin to every .docx file found in "./1-加拼音的docx-待处理" and
// writes the results, with a timestamp appended to the file name, to
// "./2-加拼音的docx-结果".
//
// Both directories are created when missing and the config file is loaded
// first. Sub-directories, files whose name starts with "~$" and files whose
// extension is not ".docx" (case-insensitive) are skipped. Processing stops
// at the first error. On exit, any recovered panic, the last error and the
// elapsed time are printed, then the program waits for input.
func main() {

    var err error
    startTime := time.Now()

    defer func() {
        if p := recover(); nil != p {
            fmt.Printf("panic recover! : %v\r\n", p)
        }
        if nil != err {
            fmt.Printf("error : %v\r\n", err)
        }

        fmt.Println("耗时:", time.Now().Sub(startTime))
        fmt.Println("按任意键结束")
        // Keep the console window open until the user responds.
        var data int
        fmt.Scanf("%d", &data)
    }()

    todoDir := "./1-加拼音的docx-待处理"
    doneDir := "./2-加拼音的docx-结果"

    if err = initDir([]string{todoDir, doneDir}); nil != err {
        return
    }
    if err = parseConfig(); nil != err {
        return
    }

    // err is the variable declared above (same scope), so the deferred
    // report still sees a failure of ReadDir.
    files, err := ioutil.ReadDir(todoDir)
    if nil != err {
        return
    }

    for _, f := range files {
        name := f.Name()
        ext := path.Ext(name)

        switch {
        case f.IsDir():
            continue
        case strings.HasPrefix(path.Base(name), "~$"):
            continue
        case !strings.EqualFold(".docx", ext):
            continue
        }

        fmt.Println("正在处理文件:" + name)

        stamp := time.Now().Format("_20060102_150405.docx")
        fromPath := todoDir + "/" + name
        toPath := doneDir + "/" + strings.TrimSuffix(path.Base(name), ext) + stamp

        if err = procOneDocxFile(fromPath, toPath); nil != err {
            return
        }
    }
}

上一篇 下一篇

猜你喜欢

热点阅读