aardio

基于pugixml构建xpath插件

2025-12-25  本文已影响0人  LCSan

背景:之前的插件基于 C# 的 HtmlAgilityPack 实现,存在严重的内存泄漏问题,解析器长时间运行后必然崩溃。于是让 AI 推荐基于 C++ 的库,并封装成一款 aardio 插件;来回调整、反复提供信息几次之后终于做了出来,AI 生成的效果比预期要好。
功能:基于 pugixml 执行 XPath 抽取;集成 Tidy 将 HTML 转换为 XML,以便 pugixml 执行 XPath 查询。

import process.gcc;
import console;

console.log("开始编译集成 Tidy 功能的 pugixml_html.dll ...");

// --- Pre-flight check: the Tidy header and the Tidy library file must both exist ---
var tidyReady = io.exist("tidy-5.6.0-vc10-32b/include/tidy.h") and io.exist("tidy-5.6.0-vc10-32b/lib/tidy.lib");
if (!tidyReady) {
    // Tell the user what is missing, wait for a key press, then stop the script.
    console.log("❌ 错误:未找到 Tidy 库文件!");
    console.log("请确保当前目录下存在 'tidy-5.6.0-vc10-32b' 文件夹。");
    console.pause();
    return;
}

// --- Create the GCC helper object, rooted at "/" ---
var gcc = process.gcc("/");

// --- 定义集成了 Tidy 功能的 C++ 包装器 ---
gcc["pugixml_html_wrapper.cpp"] = /*************
#include <windows.h>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>
#include "pugixml.hpp"
#include "tidy.h"
#include "tidybuffio.h"

// Global critical section; TidyHtml enters it around every call into the Tidy library.
CRITICAL_SECTION g_csTidy;

// DLL entry point. Creates g_csTidy when the DLL is attached to a process and
// destroys it when the DLL is detached from the process. Every other
// notification is ignored. Always returns TRUE.
BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) {
    if (ul_reason_for_call == DLL_PROCESS_ATTACH) {
        InitializeCriticalSection(&g_csTidy);
    } else if (ul_reason_for_call == DLL_PROCESS_DETACH) {
        DeleteCriticalSection(&g_csTidy);
    }
    return TRUE;
}

// --- Core: convert HTML into XML with the Tidy library (no pre-processing of the input) ---
//
// Parameters:
//   htmlString - NUL-terminated HTML text to convert.
//   buffer     - caller-owned output buffer. Receives the generated XML, or a
//                message beginning with "ERROR:" when the conversion fails.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// When buffer is null or bufferSize is not positive there is nowhere to report
// a result, so the function returns without touching memory.
// Every Tidy call is made while holding g_csTidy.
extern "C" __declspec(dllexport) void __cdecl TidyHtml(const char* htmlString, char* buffer, int bufferSize) {
    // Without a usable output buffer nothing can be written, not even an error message.
    if (!buffer || bufferSize <= 0) {
        return;
    }
    if (!htmlString) {
        strncpy_s(buffer, bufferSize, "ERROR: TidyHtml: 无效的输入参数", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    EnterCriticalSection(&g_csTidy);

    // The raw HTML is handed to Tidy unchanged.
    TidyDoc tdoc = tidyCreate();
    if (!tdoc) {
        strncpy_s(buffer, bufferSize, "ERROR: TidyHtml: 无法创建 TidyDoc", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        LeaveCriticalSection(&g_csTidy);
        return;
    }

    // Configure Tidy: XML output, no doctype, no generator mark, no indentation.
    tidyOptSetBool(tdoc, TidyXmlOut, yes);
    tidyOptSetBool(tdoc, TidyXhtmlOut, no);
    tidyOptParseValue(tdoc, "doctype", "omit");
    tidyOptSetBool(tdoc, TidyMark, no);
    tidyOptSetBool(tdoc, TidyIndentContent, no);
    tidyOptSetBool(tdoc, TidyDropEmptyParas, yes);
    tidyOptSetBool(tdoc, TidyFixUri, yes);      // apply URI encoding where needed
    tidyOptSetBool(tdoc, TidyForceOutput, yes); // still emit a document when errors were found
    tidyOptSetInt(tdoc, TidyMergeDivs, no);
    tidyOptSetInt(tdoc, TidyMergeSpans, no);
    tidyOptSetBool(tdoc, TidyLogicalEmphasis, yes);

    TidyBuffer output = {0};

    // Parse -> clean/repair -> serialize; a negative status stops the pipeline.
    int err = tidyParseString(tdoc, htmlString);
    if (err >= 0) {
        err = tidyCleanAndRepair(tdoc);
    }
    if (err >= 0) {
        err = tidySaveBuffer(tdoc, &output);
    }

    if (err < 0) {
        strncpy_s(buffer, bufferSize, "ERROR: TidyHtml: 处理HTML时发生错误,请检查输入是否为有效的HTML片段。", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    } else if (output.bp) {
        const size_t produced = output.size;
        if (produced >= static_cast<size_t>(bufferSize)) {
            // LoadHtml looks for the "缓冲区不足" marker in this message to decide whether to retry.
            _snprintf_s(buffer, bufferSize, _TRUNCATE, "ERROR: TidyHtml: 处理结果过大,缓冲区不足(需要%d字节,提供%d字节)。", static_cast<int>(produced), bufferSize);
        } else {
            // Copy exactly the bytes Tidy reported and terminate explicitly, so the
            // result does not depend on output.bp being NUL-terminated.
            memcpy(buffer, output.bp, produced);
            buffer[produced] = '\0';
        }
    } else {
        // Tidy produced no output at all: report an empty string.
        buffer[0] = '\0';
    }

    tidyBufFree(&output);
    tidyRelease(tdoc);

    LeaveCriticalSection(&g_csTidy);
}

// Exported: converts an HTML string to XML via TidyHtml and parses the result
// into a newly allocated pugixml document.
//
// Parameters:
//   htmlString - NUL-terminated HTML text.
// Returns:
//   An opaque pointer to a pugi::xml_document on success (release it with
//   FreeXml), or nullptr if htmlString is null, the Tidy conversion fails, the
//   converted output does not fit in the 10 MB buffer cap, parsing fails, or
//   memory runs out.
extern "C" __declspec(dllexport) void* __cdecl LoadHtml(const char* htmlString) {
    if (!htmlString) return nullptr;

    // Hard cap for the intermediate Tidy output buffer (10 MB).
    const size_t maxBufferSize = 10 * 1024 * 1024;

    pugi::xml_document* doc = nullptr;
    try {
        doc = new pugi::xml_document();

        // Start with 5x the input size plus 1 KB of slack, clamped to the cap so
        // that large inputs still get one attempt with the largest allowed buffer.
        const size_t inputLen = strlen(htmlString);
        size_t dynamicBufferSize = (inputLen >= maxBufferSize / 5)
            ? maxBufferSize
            : inputLen * 5 + 1024;
        dynamicBufferSize = std::min(dynamicBufferSize, maxBufferSize);

        for (;;) {
            std::vector<char> tidyBuffer(dynamicBufferSize);

            TidyHtml(htmlString, tidyBuffer.data(), static_cast<int>(tidyBuffer.size()));

            if (strncmp(tidyBuffer.data(), "ERROR:", 6) == 0) {
                // Only the "buffer too small" error is retryable, and only while
                // the buffer can still grow.
                const bool tooSmall = strstr(tidyBuffer.data(), "缓冲区不足") != nullptr;
                if (tooSmall && dynamicBufferSize < maxBufferSize) {
                    dynamicBufferSize = std::min(dynamicBufferSize * 2, maxBufferSize);
                    continue;
                }
                break;
            }

            pugi::xml_parse_result result = doc->load_string(
                tidyBuffer.data(),
                pugi::parse_default | pugi::parse_ws_pcdata | pugi::parse_escapes
            );
            if (result) {
                return doc;
            }
            break; // parse failure: fall through so the document is released
        }
    } catch (...) {
        // Allocation failures (or anything else) must not propagate through the
        // extern "C" boundary; fall through to the common cleanup.
    }

    delete doc;
    return nullptr;
}

// Exported: runs an XPath query and writes every matching result into buffer,
// one result per line ('\n'-separated, no trailing newline).
//
// Parameters:
//   docPtr     - document pointer obtained from LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   buffer     - caller-owned output buffer; receives the results, or a message
//                starting with "ERROR:" on failure.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// Result formatting per match:
//   attribute -> its value
//   element   -> its serialized markup with newlines removed, runs of spaces
//                collapsed to one space and "> <" tightened to "><"
//   comment   -> the comment text wrapped in "<!--" and "-->"
//   other     -> the node value
// Empty values are skipped. A value that does not fit in the remaining space is
// skipped as well, while later (smaller) values may still be appended.
// When buffer is null or bufferSize is not positive the function returns
// immediately because nothing can be reported.
extern "C" __declspec(dllexport) void __cdecl XPathSelect(void* docPtr, const char* xpath, char* buffer, int bufferSize) {
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    if (!xpath) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);

        pugi::xpath_node_set nodes = doc->select_nodes(xpath);

        const size_t capacity = static_cast<size_t>(bufferSize);
        size_t pos = 0;
        for (pugi::xpath_node node : nodes) {
            std::string value;

            if (node.attribute()) {
                value = node.attribute().value();
            }
            else if (node.node()) {
                pugi::xml_node n = node.node();
                if (n.type() == pugi::node_element) {
                    std::ostringstream oss;
                    n.print(oss, "", pugi::format_raw, pugi::encoding_utf8);
                    value = oss.str();

                    // Drop every newline.
                    value.erase(std::remove(value.begin(), value.end(), '\n'), value.end());
                    // Collapse each run of spaces into a single space.
                    value.erase(
                        std::unique(value.begin(), value.end(),
                                    [](char a, char b) { return a == ' ' && b == ' '; }),
                        value.end());
                    // Remove the single space left between adjacent tags.
                    std::string::size_type pos_clean = 0;
                    while ((pos_clean = value.find("> <", pos_clean)) != std::string::npos) {
                        value.replace(pos_clean, 3, "><");
                    }
                }
                else if (n.type() == pugi::node_pcdata || n.type() == pugi::node_cdata) {
                    value = n.value();
                }
                else if (n.type() == pugi::node_comment) {
                    value = "<!--";
                    value += n.value();
                    value += "-->";
                }
                else {
                    value = n.value();
                }
            }

            if (!value.empty()) {
                const size_t len = value.length();
                // Need room for the value, the '\n' separator and the NUL terminator.
                // The len < capacity test keeps the sum below from overflowing.
                if (len < capacity && pos + len + 2 < capacity) {
                    strcpy_s(buffer + pos, capacity - pos, value.c_str());
                    pos += len;
                    buffer[pos] = '\n';
                    pos++;
                    buffer[pos] = '\0';
                }
            }
        }

        // Strip the separator that follows the last result.
        if (pos > 0 && buffer[pos - 1] == '\n') {
            buffer[pos - 1] = '\0';
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath查询异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}

// Exported: runs an XPath query and writes only the first matching result into buffer.
//
// Parameters:
//   docPtr     - document pointer obtained from LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   buffer     - caller-owned output buffer; receives the result (truncated to
//                fit), an empty string when nothing matches, or a message
//                starting with "ERROR:" on failure.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// Result formatting:
//   attribute -> its value
//   element   -> its serialized markup with newlines removed, runs of spaces
//                collapsed to one space and "> <" tightened to "><"
//   other     -> the node value
// NOTE(review): unlike XPathSelect, comment nodes are returned without the
// "<!--" / "-->" wrapper - confirm whether that difference is intended.
// When buffer is null or bufferSize is not positive the function returns
// immediately because nothing can be reported.
extern "C" __declspec(dllexport) void __cdecl XPathSelectSingle(void* docPtr, const char* xpath, char* buffer, int bufferSize) {
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    if (!xpath) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);
        pugi::xpath_node node = doc->select_node(xpath);

        if (node) {
            std::string value;
            if (node.attribute()) {
                value = node.attribute().value();
            }
            else if (node.node()) {
                pugi::xml_node n = node.node();
                if (n.type() == pugi::node_element) {
                    std::ostringstream oss;
                    n.print(oss, "", pugi::format_raw, pugi::encoding_utf8);
                    value = oss.str();

                    // Drop every newline.
                    value.erase(std::remove(value.begin(), value.end(), '\n'), value.end());
                    // Collapse each run of spaces into a single space.
                    value.erase(
                        std::unique(value.begin(), value.end(),
                                    [](char a, char b) { return a == ' ' && b == ' '; }),
                        value.end());
                    // Remove the single space left between adjacent tags.
                    std::string::size_type pos_clean = 0;
                    while ((pos_clean = value.find("> <", pos_clean)) != std::string::npos) {
                        value.replace(pos_clean, 3, "><");
                    }
                }
                else {
                    value = n.value();
                }
            }

            if (!value.empty()) {
                strncpy_s(buffer, bufferSize, value.c_str(), bufferSize - 1);
                buffer[bufferSize - 1] = '\0';
            }
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath查询异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}

// Exported: writes the text content of the first node matched by an XPath
// expression into buffer.
//
// Parameters:
//   docPtr     - document pointer obtained from LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   buffer     - caller-owned output buffer; receives the text (truncated to
//                fit), an empty string when nothing matches or the match is not
//                a node, or a message starting with "ERROR:" on failure.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// When buffer is null or bufferSize is not positive the function returns
// immediately because nothing can be reported.
extern "C" __declspec(dllexport) void __cdecl GetTextContent(void* docPtr, const char* xpath, char* buffer, int bufferSize) {
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    if (!xpath) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);
        pugi::xpath_node node = doc->select_node(xpath);

        // Only node matches carry text; an attribute match leaves the buffer empty.
        if (node && node.node()) {
            std::string text = node.node().text().get();
            strncpy_s(buffer, bufferSize, text.c_str(), bufferSize - 1);
            buffer[bufferSize - 1] = '\0';
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: 获取文本内容异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}

// Exported: writes the value of an attribute of the first XPath match into buffer.
//
// Parameters:
//   docPtr     - document pointer obtained from LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   attrName   - name of the attribute to read from the matched node. If the
//                XPath itself selects an attribute, that attribute's value is
//                returned and attrName is not consulted.
//   buffer     - caller-owned output buffer; receives the value (truncated to
//                fit), an empty string when there is no match or no such
//                attribute, or a message starting with "ERROR:" on failure.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// When buffer is null or bufferSize is not positive the function returns
// immediately because nothing can be reported.
extern "C" __declspec(dllexport) void __cdecl GetNodeAttr(void* docPtr, const char* xpath, const char* attrName, char* buffer, int bufferSize) {
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    if (!xpath || !attrName) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式或属性名为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);
        pugi::xpath_node node = doc->select_node(xpath);

        if (node) {
            pugi::xml_attribute attr;
            if (node.attribute()) {
                attr = node.attribute();
            } else if (node.node()) {
                attr = node.node().attribute(attrName);
            }

            if (attr) {
                strncpy_s(buffer, bufferSize, attr.value(), bufferSize - 1);
                buffer[bufferSize - 1] = '\0';
            }
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: 获取节点属性异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}

// Exported: writes the name of the first XPath match (attribute name or node
// name) into buffer.
//
// Parameters:
//   docPtr     - document pointer obtained from LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   buffer     - caller-owned output buffer; receives the name (truncated to
//                fit), an empty string when nothing matches, or a message
//                starting with "ERROR:" on failure.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// When buffer is null or bufferSize is not positive the function returns
// immediately because nothing can be reported.
extern "C" __declspec(dllexport) void __cdecl GetNodeName(void* docPtr, const char* xpath, char* buffer, int bufferSize) {
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    if (!xpath) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);
        pugi::xpath_node node = doc->select_node(xpath);

        if (node) {
            const char* name = nullptr;
            if (node.attribute()) {
                name = node.attribute().name();
            } else if (node.node()) {
                name = node.node().name();
            }

            if (name) {
                strncpy_s(buffer, bufferSize, name, bufferSize - 1);
                buffer[bufferSize - 1] = '\0';
            }
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: 获取节点名称异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}

// Exported: destroys a document that was handed out as an opaque pointer.
// A null pointer is ignored.
extern "C" __declspec(dllexport) void __cdecl FreeXml(void* docPtr) {
    if (docPtr == nullptr) {
        return;
    }
    delete static_cast<pugi::xml_document*>(docPtr);
}

// Exported: parses an XML string into a newly allocated pugixml document.
//
// Parameters:
//   xmlString - NUL-terminated XML text.
// Returns:
//   An opaque pointer to a pugi::xml_document on success (release it with
//   FreeXml), or nullptr if xmlString is null, parsing fails, or memory runs out.
extern "C" __declspec(dllexport) void* __cdecl LoadXml(const char* xmlString) {
    if (!xmlString) {
        return nullptr;
    }

    pugi::xml_document* doc = nullptr;
    try {
        // operator new reports failure by throwing, so the allocation lives inside
        // the try block instead of being followed by a null check.
        doc = new pugi::xml_document();

        pugi::xml_parse_result result = doc->load_string(xmlString,
            pugi::parse_default | pugi::parse_ws_pcdata | pugi::parse_escapes);

        if (result) {
            return doc;
        }
    } catch (...) {
        // Exceptions must not propagate through the extern "C" boundary.
    }

    delete doc;
    return nullptr;
}

*************/

// --- Build the final compile command ---
var compileParams = 'g++ pugixml_html_wrapper.cpp pugixml.cpp -o pugixml_html.dll -shared -s -m32 -static -L"tidy-5.6.0-vc10-32b/lib" -I"tidy-5.6.0-vc10-32b/include" -ltidy -DTIDY_MAX_ATTRIBUTE_LENGTH=8192 -DTIDY_MAX_INPUT_BUFFER=1048576';

console.log("执行编译命令:");
console.log(compileParams);
console.log("");

// --- Remove any DLL left over from an earlier build ---
// The success check below only tests whether the output file exists, so a stale
// DLL would make a failed compile look like a success.
if(io.exist("/pugixml_html.dll")) {
    io.remove("/pugixml_html.dll");
}

// --- Run the compiler ---
gcc.exec(compileParams);

// --- Check the result ---
if(io.exist("/pugixml_html.dll")) {
    console.log("\n✅ 编译成功: pugixml_html.dll");
    console.log("此 DLL 已集成 Pugixml 和 Tidy 功能,已修复线程安全和缓冲区问题。");
} else {
    console.log("\n❌ 编译失败,请检查上方的错误信息。");
}
console.pause();

上一篇 下一篇

猜你喜欢

热点阅读