小作品: Python 命令行词典,含 15 万离线词库(附源码)
2016-09-09 本文已影响1898人
CasparTse
python-translate(Python 命令行词典)
python-translate 是一个简单的命令行翻译工具,数据源自必应、有道及爱词霸翻译服务。
screenshot_v0.1.3.jpg
代码说明
- Python 版本
python 2.6+
- 演示环境
BunsenLabs Linux Hydrogen (Debian GNU/Linux 8.5)
基本功能
- 英汉 / 汉英 翻译
- 拼写检查及拼写建议(仅英文)
- 数据存储 (使用 dbm 模块)
- 单词发音
使用方法
usage: translate.py [-h] [-n] [-p {espeak,festival,real}] [-s {bing,youdao,iciba}]
[-w] [-V]
word
positional arguments:
word word or 'some phrase'
optional arguments:
-h, --help show this help message and exit
-n, --nostorage turn off data storage
-p {espeak,festival,real}, --pronounce {espeak,festival,real}
text-to-speech software: 'espeak', 'festival' or 'real'
-s {bing,youdao,iciba}, --service {bing,youdao,iciba}
translate service: 'bing', 'youdao' or 'iciba'
-w, --webonly ignore local data
-V, --version show program's version number and exit
-
关于查询结果保存
默认保存查询结果,如需关闭,可使用-n
或--nostorage
选项。
$ python2 translate.py hello -n
-
关于本地数据使用
默认使用本地数据库,如需关闭,可使用-w
或--webonly
选项。
$ python2 translate.py hello -w
-
关于翻译服务选择
可使用-s
或--service
选项指定翻译服务:bing | youdao | iciba ,默认使用必应翻译。以下三种表示方法均有效:
$ python2 translate.py hello -s=youdao
$ python2 translate.py hello -s youdao
$ python2 translate.py hello -syoudao
若该选项非空,则 webonly
会自动开启,即不使用本地数据库。
-
关于单词发音
单词发音功能默认关闭,如需启用,可使用-p
或--pronounce
选项,选择具体的发音方式: espeak | festival | real 。
另外 TTS 合成语音效果一般,若有真人语音文件(对应 real 选项),可配合 aplay、mpg321、sox 等命令使用,可修改源码中的pronounce
部分以更改发音配置。
p.s. 语音资源可搜索 "OtdRealPeopleTTS"、"WyabdcRealPeopleTTS" 等关键词。
$ python2 translate.py hello -p=espeak
$ python2 translate.py hello -p=festival
$ python2 translate.py hello -p=real
库依赖 & 软件支持
$ pip install requests beautifulsoup4 lxml pyenchant
# OR
$ pip install -r requirements.txt
$ sudo apt-get install libxml2-dev libxslt-dev python-dev espeak festival alsa-base alsa-utils
小贴士
- 设置命令别名
$ alias t="python2 /path/to/the/translate.py"
$ alias te="t -p=espeak"
$ alias tf="t -p=festival"
$ alias tr="t -p=real"
$ alias tb="t -s=bing"
$ alias ty="t -s=youdao"
$ alias ti="t -s=iciba"
- data 文件夹内包含了 15 万英文单词的翻译结果
- 修改 hosts 配置可加速在线查询,参考 test 文件夹中的 hosts 文件
- 预先批量查询并保存结果,可作离线词典使用,单词列表见 spell-checker 文件夹
更多资源
- SCOWL (Spell Checker Oriented Word Lists)
- List of spell checkers
- Wiktionary:Frequency lists
- wordlist.10000
- top10000en.txt
- google-10000-english
源码(v0.1.3)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import argparse
import dbm
import re
from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing import Process
class Bing(object):
    """Translation backend that queries Bing's dictionary hover-card page."""

    def __init__(self):
        super(Bing, self).__init__()

    def query(self, word):
        """Look up *word* on cn.bing.com and return its translations.

        Args:
            word: the word or phrase to translate, either a UTF-8 encoded
                byte string (Python 2 command-line input) or a text string.

        Returns:
            The text of every non-empty ``<li>`` element of the response,
            joined with newlines, or None when the request fails, the
            response is not a non-empty HTTP 200, or the ``<h4>`` headword
            in the response is missing or differs from *word*.
        """
        import requests
        from bs4 import BeautifulSoup

        sess = requests.Session()
        sess.headers.update({
            'Host': 'cn.bing.com',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
        })
        try:
            # Pass the word through ``params`` so requests URL-encodes it;
            # plain string interpolation mangles input such as "c++" or "a&b".
            resp = sess.get('http://cn.bing.com/dict/SerpHoverTrans',
                            params={'q': word}, timeout=100)
        except requests.RequestException:
            return None

        text = resp.text
        if resp.status_code != 200 or not text:
            return None

        soup = BeautifulSoup(text, 'lxml')
        headword = soup.find('h4')
        if headword is None:
            # No headword element at all: treat as "no result" instead of
            # crashing on ``None.text``.
            return None

        # The response text is unicode, so a byte-string word must be decoded
        # before the comparison.
        expected = word.decode('utf-8') if isinstance(word, bytes) else word
        if headword.text.strip() != expected:
            return None

        entries = (li.get_text() for li in soup.find_all('li'))
        return '\n'.join([entry for entry in entries if entry])
class Youdao(object):
    """Translation backend that queries Youdao's XML ``fsearch`` endpoint."""

    def __init__(self):
        super(Youdao, self).__init__()

    def query(self, word):
        """Look up *word* on dict.youdao.com and return its translations.

        Args:
            word: the word or phrase to translate, either a UTF-8 encoded
                byte string (Python 2 command-line input) or a text string.

        Returns:
            The text of the first child of every ``<translation>`` element
            under ``<custom-translation>``, joined with newlines, or None
            when the request fails, the response is not a non-empty HTTP
            200, the XML cannot be parsed, ``<return-phrase>`` is missing or
            differs from *word*, or ``<custom-translation>`` is missing or
            has no children.
        """
        import requests
        try:
            import xml.etree.cElementTree as ET
        except ImportError:
            import xml.etree.ElementTree as ET

        sess = requests.Session()
        sess.headers.update({
            'Host': 'dict.youdao.com',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate'
        })
        try:
            # ``params`` makes requests URL-encode the word; raw string
            # interpolation breaks on characters such as "+", "&" or "#".
            resp = sess.get('http://dict.youdao.com/fsearch',
                            params={'q': word}, timeout=100)
        except requests.RequestException:
            return None

        body = resp.content
        if resp.status_code != 200 or not body:
            return None

        try:
            root = ET.fromstring(body)
        except Exception:
            # The parse-error class differs between Python versions and
            # between ElementTree / cElementTree, so catch broadly here and
            # treat an unparseable body as "no result".
            return None

        phrase = root.find('return-phrase')
        expected = word.decode('utf-8') if isinstance(word, bytes) else word
        if phrase is None or (phrase.text or '').strip() != expected:
            return None

        custom = root.find('custom-translation')
        # Explicit None / length test: Element truthiness depends on the
        # number of children and is deprecated, so spell out both cases.
        if custom is None or len(custom) == 0:
            return None

        trans = []
        for node in custom.findall('translation'):
            # Skip childless <translation> nodes rather than raising
            # IndexError on node[0].
            if len(node) and node[0].text:
                trans.append(node[0].text)
        return '\n'.join(trans)
class Iciba(object):
    """Translation backend that queries iCIBA's ``huaci_new`` endpoint."""

    def __init__(self):
        super(Iciba, self).__init__()

    def query(self, word):
        """Look up *word* on open.iciba.com and return its translations.

        Args:
            word: the word or phrase to translate.

        Returns:
            The whitespace-normalised text of every non-empty ``<p>``
            element inside the first ``icIBahyI-group_pos`` div of the
            response, joined with newlines, or None when the request fails,
            the status is not HTTP 200, or that div is not present.
        """
        import requests
        from bs4 import BeautifulSoup

        sess = requests.Session()
        sess.headers.update({
            'Host': 'open.iciba.com',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate'
        })
        try:
            # ``params`` makes requests URL-encode the word.
            resp = sess.get('http://open.iciba.com/huaci_new/dict.php',
                            params={'word': word}, timeout=100)
        except requests.RequestException:
            return None
        if resp.status_code != 200:
            return None

        # The pattern matches the div with backslash-escaped quotes
        # (class=\"...\"), i.e. the markup as it appears in the raw response.
        pattern = r'(<div class=\\\"icIBahyI-group_pos\\\">[\s\S]+?</div>)'
        match = re.search(pattern, resp.text)
        if match is None:
            # No translation block in the response: report "no result"
            # explicitly instead of relying on a swallowed AttributeError.
            return None

        soup = BeautifulSoup(match.group(1), 'lxml')
        trans = []
        for paragraph in soup.find_all('p'):
            line = paragraph.get_text().replace('\t', '')
            line = re.sub(r'\s+', ' ', line).strip()
            if line:
                trans.append(line)
        return '\n'.join(trans)
# Service used when the caller does not name one.
DEFAULT_SERVICE = 'bing'

# Directory that contains this script; data files are located relative to it.
path = os.path.realpath(__file__)
path = os.path.dirname(path)

# Local vocabulary store, opened at import time; flag 'c' creates the
# database file if it does not exist yet.
db = dbm.open(path + '/data/vocabulary', 'c')
class Client(object):
    """Front end tying together lookup, spelling suggestions, speech and storage.

    Attributes are plain instance fields:
      service -- name of the translation backend ('bing', 'youdao', 'iciba')
      word    -- the word or phrase being looked up
      trans   -- translation fetched from the web by translate(), else None
      db      -- mapping consulted before going online; an empty dict when
                 ``webonly`` is set, otherwise the module-level ``db``
    """

    def __init__(self, word, service=None, webonly=False):
        """Store the query and choose the lookup cache.

        Args:
            word: the word or phrase to look up.
            service: backend name; falls back to DEFAULT_SERVICE when falsy.
            webonly: when true, ignore the local database for lookups.
        """
        super(Client, self).__init__()
        self.service = service or DEFAULT_SERVICE
        self.word = word
        self.trans = None
        self.db = {} if webonly else db

    def translate(self):
        """Return the translation of ``self.word``.

        The local cache (``self.db``) is consulted first; on a miss the
        selected backend is queried and its result is remembered in
        ``self.trans`` so that updateDB() can persist it.

        Returns:
            The cached or freshly fetched translation, or None when the
            backend found nothing.

        Raises:
            ValueError: if ``self.service`` is not a known backend name.
        """
        cached = self.db.get(self.word)
        if cached:
            return cached

        if self.service == 'bing':
            backend = Bing()
        elif self.service == 'youdao':
            backend = Youdao()
        elif self.service == 'iciba':
            backend = Iciba()
        else:
            # Previously this fell through to an unbound local variable.
            raise ValueError('unknown translation service: %r' % (self.service,))

        self.trans = backend.query(self.word)
        return self.trans

    def suggest(self):
        """Return spelling suggestions for ``self.word``.

        Returns:
            A list of suggestions from enchant, or None when the word is
            empty, contains characters other than ASCII letters, digits,
            quote, hyphen, dot and whitespace, or when the enchant module
            cannot be imported.
        """
        if not self.word:
            # enchant rejects empty input; there is nothing to suggest anyway.
            return None
        if re.sub(r'[a-zA-Z\d\'\-\.\s]', '', self.word):
            return None

        try:
            import enchant
        except ImportError:
            # Suggestions are a best-effort extra; do without them when the
            # optional spell-checking library is not installed.
            return None

        try:
            checker = enchant.DictWithPWL(
                'en_US', path + '/data/spell-checker/american-english-large')
        except Exception:
            # Fall back to the stock dictionary when the personal word list
            # cannot be loaded.
            checker = enchant.Dict('en_US')
        return checker.suggest(self.word)

    def pronounce(self, tts):
        """Speak ``self.word`` using the given text-to-speech method.

        Args:
            tts: 'festival', 'espeak', or 'real' (plays the first matching
                ``<word>.wav`` found under data/RealPeopleTTS/ with aplay).

        Returns:
            True after the command has been run (its exit status is not
            inspected), False when *tts* is not a supported method.
        """
        try:
            from shlex import quote  # Python 3.3+
        except ImportError:
            from pipes import quote  # Python 2
        import subprocess

        # The word comes straight from the command line and is spliced into
        # a shell pipeline, so it must be shell-quoted: otherwise quotes,
        # "$" or backticks in the word break or hijack the command.
        quoted_word = quote(self.word)
        if tts == 'festival':
            cmd = 'echo %s | festival --tts > /dev/null 2>&1' % quoted_word
        elif tts == 'espeak':
            cmd = 'espeak -v en-us %s > /dev/null 2>&1' % quoted_word
        elif tts == 'real':
            cmd = ('find %s -type f -iname %s | head -n1 '
                   '| xargs -I {} aplay {} > /dev/null 2>&1'
                   % (quote(path + '/data/RealPeopleTTS/'),
                      quote(self.word + '.wav')))
        else:
            return False

        try:
            subprocess.call(cmd, shell=True)
        except OSError:
            # Pronunciation is best-effort; a missing shell or player must
            # not abort the lookup.
            pass
        return True

    def updateDB(self):
        """Persist the last web translation to the module-level database.

        Writes to the global ``db`` (not ``self.db``), so a result fetched
        in web-only mode is still stored. The database is closed after the
        write, so it cannot be used again in the calling process.

        Returns:
            True, whether or not anything was written.
        """
        if self.trans:
            db[self.word] = self.trans.encode('utf-8')
            db.close()
        return True
def parseArgs(argv=None):
    """Parse the command-line options of the translator.

    Args:
        argv: optional list of argument strings to parse instead of the
            process arguments; None (the default) makes argparse read
            ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        An argparse.Namespace with the attributes ``word``, ``nostorage``,
        ``pronounce``, ``service`` and ``webonly``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('word', help="word or 'some phrase'")
    parser.add_argument(
        '-n', '--nostorage', dest='nostorage', action='store_true',
        help='turn off data storage')
    parser.add_argument(
        '-p', '--pronounce', dest='pronounce',
        choices=['espeak', 'festival', 'real'],
        help="text-to-speech software: 'espeak', 'festival' or 'real'")
    parser.add_argument(
        '-s', '--service', dest='service',
        choices=['bing', 'youdao', 'iciba'],
        help="translate service: 'bing', 'youdao' or 'iciba'")
    parser.add_argument(
        '-w', '--webonly', dest='webonly', action='store_true',
        help='ignore local data')
    parser.add_argument(
        '-V', '--version', action='version', version='%(prog)s 0.1.3')
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: look the word up, print the result, then
    # optionally pronounce it and store it; on a miss print spelling
    # suggestions instead.
    args = parseArgs()
    word = args.word.strip()
    service = args.service
    # Naming a service explicitly forces an online lookup.
    webonly = args.webonly or bool(service)

    C = Client(word, service=service, webonly=webonly)

    # Run the lookup and the spelling suggestion concurrently in threads.
    pool = ThreadPool()
    _trans = pool.apply_async(C.translate)
    _suggestion = pool.apply_async(C.suggest)

    trans = _trans.get()
    if trans:
        print(trans)
        if args.pronounce:
            # Daemonic: the script does not wait for this process on exit.
            p1 = Process(target=C.pronounce, args=(args.pronounce,))
            p1.daemon = True
            p1.start()
        if not args.nostorage:
            # Must NOT be daemonic: multiprocessing terminates daemonic
            # children when the parent exits, which could kill this process
            # before the database write has completed. A non-daemonic child
            # is joined at exit instead.
            p2 = Process(target=C.updateDB)
            p2.daemon = False
            p2.start()
    else:
        suggestion = _suggestion.get()
        if not suggestion:
            print('No translations found for \"%s\" .' % (word))
        else:
            print('No translations found for \"%s\", maybe you meant:\n\n%s'
                  % (word, ' / '.join(suggestion)))