利用原生php方法，把docx文件内容项重复的查出来

2021-01-06 本文已影响0人江江简书
原本想用phpword来读我的word文件的，奈何死活都不成功，也不知道是为什么，因此在互联网的大洋中寻找解决方案，终于是找到了一个解决的方案了，

https://www.cnblogs.com/5aiQ/p/14238239.html //参考文章
https://github.com/pintobikez/wordphp //这个也感觉像是原文
思路解读

大概看了一下这个类的代码基本是利用原生的zip包来解决的，但是至于原理的话我也不是太懂，所以就抄了实现了需求先。
目前只支撑docx，如果要读doc文件，我试了直接强转成docx，问题不大
主要用这个功能来解决的问题就是，例如有1000个人每人发了一份报名表给你，那你怎样快速的辨别这其中报名表中是否有重复的，如果是一份份登记出来的话，那得搞到猴年马月，加班加到你怀疑人生而且在重复的操作
<?php
/**
 * Created: jiangshiwen
 * Date: 2021/1/6
 * Time: 15:03
 * Theme:
 */
class WordPHP
{
    private $debug = false;
    private $file;
    private $rels_xml;
    private $doc_xml;
    private $doc_media = [];
    private $last = 'none';
//    private $encoding = 'ISO-8859-1';
    private $encoding = 'UTF-8';
    private $tmpDir = 'tmp';

    /**
     * CONSTRUCTOR
     *
     * @param Boolean $debug Debug mode or not
     * @return void
     */
    public function __construct($debug_=null, $encoding=null)
    {
        if($debug_ != null) {
            $this->debug = $debug_;
        }
        if ($encoding != null) {
            $this->encoding = $encoding;
        }
        $this->tmpDir = dirname(__FILE__);
    }

    /**
     * Sets the tmp directory where images will be stored
     *
     * @param string $tmp The location
     * @return void
     */
    private function setTmpDir($tmp)
    {
        $this->tmpDir = $tmp;
    }

    /**
     * READS The Document and Relationships into separated XML files
     *
     * @param var $object The class variable to set as DOMDocument
     * @param var $xml The xml file
     * @param string $encoding The encoding to be used
     * @return void
     */
    private function setXmlParts(&$object, $xml, $encoding)
    {
        $object = new DOMDocument();
        $object->encoding = $encoding;
        $object->preserveWhiteSpace = false;
        $object->formatOutput = true;
        $object->loadXML($xml);
        $object->saveXML();
    }

    /**
     * READS The Document and Relationships into separated XML files
     *
     * @param String $filename The filename
     * @return void
     */
    private function readZipPart($filename)
    {
        $zip = new ZipArchive();
        $_xml = 'word/document.xml';
        $_xml_rels = 'word/_rels/document.xml.rels';
//        var_dump($zip->open($filename));exit;
        if (true === $zip->open($filename)) {
            if (($index = $zip->locateName($_xml)) !== false) {
                $xml = $zip->getFromIndex($index);
            }
            //Get the relationships
            if (($index = $zip->locateName($_xml_rels)) !== false) {
                $xml_rels = $zip->getFromIndex($index);
            }
            // load all images if they exist
            for ($i=0; $i<$zip->numFiles;$i++) {
                $zip_element = $zip->statIndex($i);
                if(preg_match("([^\s]+(\.(?i)(jpg|jpeg|png|gif|bmp))$)",$zip_element['name'])) {
                    $this->doc_media[$zip_element['name']] = $zip_element['name'];
                }
            }
            $zip->close();
        } else die('non zip file');

        $enc = mb_detect_encoding($xml);
        $this->setXmlParts($this->doc_xml, $xml, $enc);
        $this->setXmlParts($this->rels_xml, $xml_rels, $enc);

        if($this->debug) {
            echo "<textarea style='width:100%; height: 200px;'>";
            echo $this->doc_xml->saveXML();
            echo "</textarea>";
            echo "<textarea style='width:100%; height: 200px;'>";
            echo $this->rels_xml->saveXML();
            echo "</textarea>";
        }
    }

    /**
     * CHECKS THE FONT FORMATTING OF A GIVEN ELEMENT
     * Currently checks and formats: bold, italic, underline, background color and font family
     *
     * @param XML $xml The XML node
     * @return String HTML formatted code
     */
    private function checkFormating(&$xml)
    {
        $node = trim($xml->readOuterXML());
        $t = '';
        // add <br> tags
        if (strstr($node,'<w:br ')) $t = '<br>';
        // look for formatting tags
        $f = "<span style='";
        $reader = new XMLReader();
        $reader->XML($node);
        $img = null;

        while ($reader->read()) {
            if($reader->name == "w:b") {
                $f .= "font-weight: bold,";
            }
            if($reader->name == "w:i") {
                $f .= "text-decoration: underline,";
            }
            if($reader->name == "w:color") {
                $f .="color: #".$reader->getAttribute("w:val").",";
            }
            if($reader->name == "w:rFont") {
                $f .="font-family: #".$reader->getAttribute("w:ascii").",";
            }
            if($reader->name == "w:shd" && $reader->getAttribute("w:val") != "clear" && $reader->getAttribute("w:fill") != "000000") {
                $f .="background-color: #".$reader->getAttribute("w:fill").",";
            }
            if($reader->name == 'w:drawing' && !empty($reader->readInnerXml())) {
                $r = $this->checkImageFormating($reader);
                $img = $r !== null ? "<image src='".$r."' />" : null;
            }
        }

        $f = rtrim($f, ',');
        $f .= "'>";
        $t .= ($img !== null ? $img : htmlentities($xml->expand()->textContent));

        return $f.$t."</span>";
    }

    /**
     * CHECKS THE ELEMENT FOR UL ELEMENTS
     * Currently under development
     *
     * @param XML $xml The XML node
     * @return String HTML formatted code
     */
    private function getListFormating(&$xml)
    {
        $node = trim($xml->readOuterXML());

        $reader = new XMLReader();
        $reader->XML($node);
        $ret=[];
        $close = "";
        while ($reader->read()){
            if($reader->name == "w:numPr" && $reader->nodeType == XMLReader::ELEMENT ) {

            }
            if($reader->name == "w:numId" && $reader->hasAttributes) {
                switch($reader->getAttribute("w:val")) {
                    case 1:
                        $ret['open'] = "<ol><li>";
                        $ret['close'] = "</li></ol>";
                        break;
                    case 2:
                        $ret['open'] = "<ul><li>";
                        $ret['close'] = "</li></ul>";
                        break;
                }

            }
        }
        return $ret;
    }

    /**
     * CHECKS IF THERE IS AN IMAGE PRESENT
     * Currently under development
     *
     * @param XML $xml The XML node
     * @return String The location of the image
     */
    private function checkImageFormating(&$xml)
    {
        $content = trim($xml->readInnerXml());

        if (!empty($content)) {

            $relId;
            $notfound = true;
            $reader = new XMLReader();
            $reader->XML($content);

            while ($reader->read() && $notfound) {
                if ($reader->name == "a:blip") {
                    $relId = $reader->getAttribute("r:embed");
                    $notfound = false;
                }
            }

            // image id found, get the image location
            if (!$notfound && $relId) {
                $reader = new XMLReader();
                $reader->XML($this->rels_xml->saveXML());

                while ($reader->read()) {
                    if ($reader->nodeType == XMLREADER::ELEMENT && $reader->name=='Relationship') {
                        if($reader->getAttribute("Id") == $relId) {
                            $link = "word/".$reader->getAttribute('Target');
                            break;
                        }
                    }
                }

                $zip = new ZipArchive();
                $im = null;
                if (true === $zip->open($this->file)) {
                    $im = $this->createImage($zip->getFromName($link), $relId, $link);
                }
                $zip->close();
                return $im;
            }
        }

        return null;
    }

    /**
     * Creates an image in the filesystem
     *
     * @param objetc $image The image object
     * @param string $relId The image relationship Id
     * @param string $name The image name
     * @return Array With HTML open and closing tag definition
     */
    private function createImage($image, $relId, $name)
    {
        $arr = explode('.', $name);
        $l = count($arr);
        $ext = strtolower($arr[$l-1]);

        $im = imagecreatefromstring($image);
        $fname = $this->tmpDir.'/tmp/'.$relId.'.'.$ext;

        switch ($ext) {
            case 'png':
                imagepng($im, $fname);
                break;
            case 'bmp':
                imagebmp($im, $fname);
                break;
            case 'gif':
                imagegif($im, $fname);
                break;
            case 'jpeg':
            case 'jpg':
                imagejpeg($im, $fname);
                break;
            default:
                return null;
        }

        return $fname
            ;
    }

    /**
     * CHECKS IF ELEMENT IS AN HYPERLINK
     *
     * @param XML $xml The XML node
     * @return Array With HTML open and closing tag definition
     */
    private function getHyperlink(&$xml)
    {
        $ret = array('open'=>'<ul>','close'=>'</ul>');
        $link ='';
        if($xml->hasAttributes) {
            $attribute = "";
            while($xml->moveToNextAttribute()) {
                if($xml->name == "r:id")
                    $attribute = $xml->value;
            }

            if($attribute != "") {
                $reader = new XMLReader();
                $reader->XML($this->rels_xml->saveXML());

                while ($reader->read()) {
                    if ($reader->nodeType == XMLREADER::ELEMENT && $reader->name=='Relationship') {
                        if($reader->getAttribute("Id") == $attribute) {
                            $link = $reader->getAttribute('Target');
                            break;
                        }
                    }
                }
            }
        }

        if($link != "") {
            $ret['open'] = "<a href='".$link."' target='_blank'>";
            $ret['close'] = "</a>";
        }

        return $ret;
    }


    /**
     * PROCESS TABLE CONTENT
     *
     * @param XML $xml The XML node
     * @return THe HTML code of the table
     */
    private function checkTableFormating(&$xml)
    {
        $table = "<table><tbody>";

        while ($xml->read()) {
            if ($xml->nodeType == XMLREADER::ELEMENT && $xml->name === 'w:tr') { //table row
                $tc = $ts = "";


                $tr = new XMLReader;
                $tr->xml(trim($xml->readOuterXML()));

                while ($tr->read()) {
                    if ($tr->nodeType == XMLREADER::ELEMENT && $tr->name === 'w:tcPr') { //table element properties
                        $ts = $this->processTableStyle(trim($tr->readOuterXML()));
                    }
                    if ($tr->nodeType == XMLREADER::ELEMENT && $tr->name === 'w:tc') { //table column
                        $tc .= $this->processTableRow(trim($tr->readOuterXML()));
                    }
                }
                $table .= '<tr style="'.$ts.'">'.$tc.'</tr>';
            }
        }

        $table .= "</tbody></table>";
        return $table;
    }

    /**
     * PROCESS THE TABLE ROW STYLE
     *
     * @param string $content The XML node content
     * @return THe HTML code of the table
     */
    private function processTableStyle($content)
    {
        /*border-collapse:collapse;
        border-bottom:4px dashed #0000FF;
        border-top:6px double #FF0000;
        border-left:5px solid #00FF00;
        border-right:5px solid #666666;*/

        $tc = new XMLReader;
        $tc->xml($content);
        $style = "border-collapse:collapse;";

        while ($tc->read()) {
            if ($tc->name === "w:tcBorders") {
                $tc2 = new SimpleXMLElement($tc->readOuterXML());

                foreach ($tc2->children('w',true) as $ch) {
                    if (in_array($ch->getName(), ['left','top','botom','right']) ) {
                        $line = $this->convertLine($ch['val']);
                        $style .= " border-".$ch->getName().":".$ch['sz']."px $line #".$ch['color'].";";
                    }
                }

                $tc->next();
            }
        }
        return $style;
    }

    private function convertLine($in)
    {
        if (in_array($in, ['dotted']))
            return "dashed";

        if (in_array($in, ['dotDash','dotdotDash','dotted','dashDotStroked','dashed','dashSmallGap']))
            return "dashed";

        if (in_array($in, ['double','triple','threeDEmboss','threeDEngrave','thick']))
            return "double";

        if (in_array($in, ['nil','none']))
            return "none";

        return "solid";
    }

    /**
     * PROCESS THE TABLE ROW
     *
     * @param string $content The XML node content
     * @return THe HTML code of the table
     */
    private function processTableRow($content)
    {
        $tc = new XMLReader;
        $tc->xml($content);
        $ct = "";

        while ($tc->read()) {
            if ($tc->name === "w:r") {
                $ct .= "<td>".$this->checkFormating($tc)."</td>";
                $tc->next();
            }
        }
        return $ct;
    }

    /**
     * READS THE GIVEN DOCX FILE INTO HTML FORMAT
     *
     * @param String $filename The DOCX file name
     * @return String With HTML code
     */
    public function readDocument($filename)
    {
        $this->file = $filename;
        $this->readZipPart($filename);
        $reader = new XMLReader();
        $reader->XML($this->doc_xml->saveXML());

        $text = ''; $list_format=[];

        $formatting['header'] = 0;
        // loop through docx xml dom
        while ($reader->read()) {
            // look for new paragraphs
            $paragraph = new XMLReader;
            $p = $reader->readOuterXML();

            if ($reader->nodeType == XMLREADER::ELEMENT && $reader->name === 'w:p') {
                // set up new instance of XMLReader for parsing paragraph independantly
                $paragraph->xml($p);

                preg_match('/<w:pStyle w:val="(Heading.*?[1-6])"/',$p,$matches);
                if(isset($matches[1])) {
                    switch($matches[1]){
                        case 'Heading1': $formatting['header'] = 1; break;
                        case 'Heading2': $formatting['header'] = 2; break;
                        case 'Heading3': $formatting['header'] = 3; break;
                        case 'Heading4': $formatting['header'] = 4; break;
                        case 'Heading5': $formatting['header'] = 5; break;
                        case 'Heading6': $formatting['header'] = 6; break;
                        default: $formatting['header'] = 0; break;
                    }
                }
                // open h-tag or paragraph
                $text .= ($formatting['header'] > 0) ? '<h'.$formatting['header'].'>' : '<p>';

                // loop through paragraph dom
                while ($paragraph->read()) {
                    // look for elements
                    if ($paragraph->nodeType == XMLREADER::ELEMENT && $paragraph->name === 'w:r') {
                        if($list_format == "")
                            $text .= $this->checkFormating($paragraph);
                        else {
                            $text .= $list_format['open'];
                            $text .= $this->checkFormating($paragraph);
                            $text .= $list_format['close'];
                        }
                        $list_format ="";
                        $paragraph->next();
                    }
                    else if($paragraph->nodeType == XMLREADER::ELEMENT && $paragraph->name === 'w:pPr') { //lists
                        $list_format = $this->getListFormating($paragraph);
                        $paragraph->next();
                    }
                    else if($paragraph->nodeType == XMLREADER::ELEMENT && $paragraph->name === 'w:drawing') { //images
                        $text .= $this->checkImageFormating($paragraph);
                        $paragraph->next();
                    }
                    else if ($paragraph->nodeType == XMLREADER::ELEMENT && $paragraph->name === 'w:hyperlink') {
                        $hyperlink = $this->getHyperlink($paragraph);
                        $text .= $hyperlink['open'];
                        $text .= $this->checkFormating($paragraph);
                        $text .= $hyperlink['close'];
                        $paragraph->next();
                    }
                }
                $text .= ($formatting['header'] > 0) ? '</h'.$formatting['header'].'>' : '</p>';
            }
            else if ($reader->nodeType == XMLREADER::ELEMENT && $reader->name === 'w:tbl') { //tables
                $paragraph->xml($p);
                $text .= $this->checkTableFormating($paragraph);
                $reader->next();
            }
        }
        $reader->close();
        if($this->debug) {
            echo "<div style='width:100%; height: 200px;'>";
            echo mb_convert_encoding($text, $this->encoding);
            echo "</div>";
        }
        return mb_convert_encoding($text, $this->encoding);
    }
}
$rt = new WordPHP();
$text = $rt->readDocument('xxx.docx');//这里要利用php读取文件下的方法遍历操作
$patt1 = '/姓  名([\w\W]*?)性 别/'; //这里要根据你表格的具体信息来操作
preg_match_all($patt1,$text,$rs);
preg_match_all('/[\x{4e00}-\x{9fa5}]+/u', $rs[1][0], $res);
var_dump($res[0]); //后续就不实现了大概思路已经整理
利用原生php方法，把docx文件内容项重复的查出来

原本想用phpword来读我的word文件的，奈何死活都不成功，也不知道是为什么，因此在互联网的大洋中寻找解决方案，终于是找到了一个解决的方案了，

思路解读

猜你喜欢

热点阅读