英语单词功能实现

2018-01-19  本文已影响0人  鸿雁长飞光不度

从百度文库找到一份单词表,其内容格式如下:

原始单词.png

通过 PHP 代码处理原始文件,提取其中的单词并保存到新的文件:

// Extract the leading English word from every line of word.txt and append the
// words, one per line, to new_word.txt. Each extracted word is also echoed.
$file = fopen("word.txt", "r");
if ($file === false) {
    // Without this guard feof(false) never reports end-of-file and the loop spins forever.
    exit("Unable to open word.txt for reading\n");
}
$new_file = fopen('new_word.txt', 'a');
if ($new_file === false) {
    fclose($file);
    exit("Unable to open new_word.txt for writing\n");
}

$words = array();
$i = 0;
// fgets() returns false at end of file (or on a read error); looping on its
// result avoids the extra iteration a feof()-driven loop performs after the
// last line has been consumed.
while (($str = fgets($file)) !== false) {
    // A word is a run of ASCII letters at the start of the line that is
    // followed by whitespace; lines that do not match are skipped.
    if (preg_match('/^([a-zA-Z]+)\s+/', $str, $matches) === 1) {
        echo $matches[1] . "<br>";
        // $words stays indexed by source line number, as before.
        $words[$i] = $matches[1] . "\n";
        fputs($new_file, $words[$i]);
    }
    $i++;
}
fclose($file);
fclose($new_file);

文件内容如下:

新的文件.png

获取音频

// Download the pronunciation audio of every word listed in new_word.txt
// (one word per line) and store it as ./records/<word>.mp3.
$file = fopen("new_word.txt", "r");
if ($file === false) {
    // Without this guard feof(false) never reports end-of-file and the loop spins forever.
    exit("Unable to open new_word.txt for reading\n");
}
if (!is_dir("./records")) {
    // file_put_contents() does not create missing directories.
    mkdir("./records");
}
// fgets() returns false at end of file, which ends the loop cleanly.
while (($str = fgets($file)) !== false) {
    // Strip only the line terminator. Blindly dropping the last byte would cut
    // a letter off the final word when the file does not end with a newline.
    $str = rtrim($str, "\r\n");
    if ($str === '') {
        // A blank line would otherwise request an empty word and write "./records/.mp3".
        continue;
    }
    echo $str . "\n";
    $output = file_get_contents("http://dict.youdao.com/dictvoice?audio=" . rawurlencode($str) . "&type=2");
    if ($output === false) {
        // Do not leave an empty .mp3 behind when the download failed.
        echo "download failed: " . $str . "\n";
        continue;
    }
    file_put_contents("./records/$str" . ".mp3", $output);
}
fclose($file);
image.png

抓取页面

    /**
     * Download the page at $word_url, cache it as htmls/<word>.html next to this
     * file, and check that the page really describes $word.
     *
     * @param string $word_url URL of the page to download.
     * @param string $word     Word the page is expected to describe.
     * @return mixed Result of $this->error('出错') when the download fails or the
     *               word shown on the page differs from $word (case-insensitive);
     *               nothing on success.
     */
    function get_word_msg($word_url, $word)
    {
        $path = __DIR__ . '/htmls/' . $word . ".html";

        $page = file_get_contents($word_url);
        if ($page === false) {
            // Do not cache an empty file: a cached file is later treated as a valid page.
            return $this->error('出错');
        }
        file_put_contents($path, $page);

        $html = new simple_html_dom();
        $html->load_file($path);
        // find() with an index yields null when nothing matches the selector.
        $node = $html->find('#cigencizui-word', 0);
        $web_word = $node === null ? '' : $node->plaintext;
        // Release the DOM on every path, including the error path below.
        $html->clear();

        if (strcasecmp($web_word, $word) != 0) {
            var_dump($web_word);
            var_dump($word);
            return $this->error('出错');
        }
        // Pause between successful downloads.
        sleep(1);
    }

    /**
     * Look up $word on www.dicts.cn, pull the "dictword..." path out of the raw
     * response (headers included) and hand the resulting URL to get_word_msg().
     *
     * @param string $word Word to look up.
     * @return mixed Result of $this->error('出错') when the request fails or the
     *               response contains no "dictword" path; nothing otherwise.
     */
    function grab_word($word)
    {
        $curl = curl_init();
        // Encode the word so that unusual characters cannot break the query string.
        curl_setopt($curl, CURLOPT_URL, "http://www.dicts.cn/dict/dict/dict!searchhtml3.asp?id=" . rawurlencode($word));
        // Keep the response headers in the returned string.
        curl_setopt($curl, CURLOPT_HEADER, 1);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        $data = curl_exec($curl);
        curl_close($curl);
        if ($data === false) {
            return $this->error('出错');
        }

        $data = strstr($data, 'dictword');
        if ($data === false) {
            // No target path in the response; avoid fetching the bare site root.
            return $this->error('出错');
        }
        // strstr() returns everything up to the end of the response, so cut the
        // path at the end of its line to drop the trailing CR/LF and whatever follows.
        // NOTE(review): presumably the path comes from a redirect header - confirm.
        $data = substr($data, 0, strcspn($data, "\r\n"));

        $real_url = "http://www.dicts.cn/" . $data;
        $this->get_word_msg($real_url, $word);
    }
   // 通过循环可以抓取所有单词

利用 simple_html_dom 分析网页并抓取内容

    /**
     * Analyse the cached HTML page of every word listed in new_word.txt and
     * store the collected rows in the h_dict_word table.
     *
     * Words without a cached htmls/<word>.html file are skipped. Rows still
     * buffered in self::$words after the loop are inserted at the end.
     */
    public function grab_word_act(){

        if (!is_dir(__DIR__ . "/htmls")){
            mkdir(__DIR__ . "/htmls");
        }
        if (!is_dir(__DIR__ . "/images")){
            mkdir(__DIR__ . "/images");
        }
        $file = fopen(__DIR__."/new_word.txt", "r");
        if ($file === false) {
            // Without this guard feof(false) never reports end-of-file and the loop spins forever.
            return;
        }
        // fgets() returns false at end of file, which ends the loop cleanly.
        while (($str = fgets($file)) !== false) {
            // Strip only the line terminator. Dropping the last byte would cut a
            // letter off the final word when the file does not end with a newline.
            $str = rtrim($str, "\r\n");
            if ($str === '') {
                continue;
            }
            if (file_exists(__DIR__ . '/htmls/' . $str . ".html")) {
                $this->analysis_word($str);
            }
        }
        // Insert the remainder; skip the call when nothing is left in the buffer.
        if (!empty(self::$words)) {
            Db::table('h_dict_word')->insertAll(self::$words);
        }
        fclose($file);
    }

    /**
     * Parse the cached page htmls/<word>.html and append one row describing the
     * word to self::$words. Once ten rows are buffered they are inserted into
     * h_dict_word and the buffer is emptied.
     *
     * Nothing is recorded when the word shown on the page differs from $word
     * (case-insensitive comparison).
     *
     * @param string $word Word whose cached page should be analysed.
     * @return void
     */
    public function analysis_word($word)
    {
        $html = new simple_html_dom();
        $html->load_file(__DIR__ . '/htmls/' . $word . ".html");

        // find() with an index yields null when nothing matches, so every
        // property read below is guarded instead of reading a property of null.
        $pron_node = $html->find('#cigencizui-word-pron>.en-UK', 0);
        $yinbiao = $pron_node === null ? null : $pron_node->innertext;
        $mean_node = $html->find('#cigencizui-word-info ul', 0);
        $word_mean = $mean_node === null ? null : $mean_node->innertext;

        $data = $this->getEmptyArray(array('source', 'story', 'dictionary', 'symbol', 'mean', 'name','remember'));

        $word_node = $html->find('#cigencizui-word', 0);
        $web_word = $word_node === null ? '' : $word_node->plaintext;
        if (strcasecmp($web_word, $word) != 0) {
            // Release the DOM before bailing out; the early return used to skip clear().
            $html->clear();
            return;
        }

        $data['name'] = $word;
        $data['symbol'] = $yinbiao;
        $data['mean'] = $word_mean;

        // Sibling <div>s after the page header: a heading div switches the
        // current section, every other div is appended to that section.
        $divs = $html->find('#cigencizui-content .page-header~div');
        if (!empty($divs)) {
            $flag = "";
            foreach ($divs as $item) {
                if (strpos($item->plaintext, '词源说明') === 0) {
                    $flag = "source";
                } else if (strpos($item->plaintext, '21世纪大') === 0) {
                    $flag = 'dictionary';
                    $data['dictionary'] = array();
                } else if (strpos($item->plaintext, "不拘一格背单词") === 0) {
                    $flag = "remember";
                } else if (strpos($item->plaintext, "词源故事") === 0) {
                    $flag = 'story';
                } else if ($flag === 'source') {
                    $data['source'] .= $item->innertext;
                } else if ($flag === 'remember') {
                    $data['remember'] .= $item->innertext;
                } else if ($flag === 'story') {
                    $data['story'] .= $item->innertext;
                }
            }
            if (array_key_exists('dictionary', $data)) {
                $spans = $html->find('#cigencizui-content .word');
                // NOTE(review): each iteration overwrites the previous value, so
                // only the last ".word" element is kept, and the array() assigned
                // above survives when there are none. Looks unintended - confirm
                // whether the entries should be concatenated instead.
                foreach ($spans as $item) {
                    $data['dictionary'] = $item->innertext;
                }
            }
        }

        self::$words[] = $data;
        // Flush in batches of ten rows.
        if (count(self::$words) === 10) {
            Db::table('h_dict_word')->insertAll(self::$words);
            self::$words = array();
        }
        $html->clear();
    }
image.png
上一篇下一篇

猜你喜欢

热点阅读