基于pugixml构建xpath插件
2025-12-25 本文已影响0人
LCSan
背景:之前的插件是基于 C# 的 HtmlAgilityPack 实现的,存在严重的内存泄漏问题,解析器运行时间一长必然崩溃。于是让 AI 推荐基于 C++ 的库,并封装成一款 aardio 插件;来回折腾、反复投喂了几次之后终于做了出来,AI 生成的效果比预期好。
功能:基于 pugixml 进行 XPath 抽取;集成 Tidy 将 HTML 转换为 XML,以便 pugixml 解析并执行 XPath 查询。
import process.gcc;
import console;
// Announce the build.
console.log("开始编译集成 Tidy 功能的 pugixml_html.dll ...");

// --- Pre-flight check: the Tidy header and the Tidy library file must both exist ---
var hasTidyHeader = io.exist("tidy-5.6.0-vc10-32b/include/tidy.h");
var hasTidyLib = io.exist("tidy-5.6.0-vc10-32b/lib/tidy.lib");
if( !hasTidyHeader or !hasTidyLib ) {
	// Tell the user what is missing, wait for a key press, then stop the script.
	console.log("❌ 错误:未找到 Tidy 库文件!");
	console.log("请确保当前目录下存在 'tidy-5.6.0-vc10-32b' 文件夹。");
	console.pause();
	return;
}

// --- Create the GCC object ---
// NOTE(review): "/" is presumably the directory the compiler works in — confirm against process.gcc.
var gcc = process.gcc("/");
// --- 定义集成了 Tidy 功能的 C++ 包装器 ---
gcc["pugixml_html_wrapper.cpp"] = /*************
#include <windows.h>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>
#include "pugixml.hpp"
#include "tidy.h"
#include "tidybuffio.h"
// Global critical section used to guard calls into the Tidy library.
CRITICAL_SECTION g_csTidy;

// DLL entry point: manages the lifetime of g_csTidy.
// The critical section is initialized on DLL_PROCESS_ATTACH and deleted on
// DLL_PROCESS_DETACH; every other notification is ignored. Always returns TRUE.
BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) {
    if (ul_reason_for_call == DLL_PROCESS_ATTACH) {
        InitializeCriticalSection(&g_csTidy);
    } else if (ul_reason_for_call == DLL_PROCESS_DETACH) {
        DeleteCriticalSection(&g_csTidy);
    }
    return TRUE;
}
// Core routine: converts an HTML string to XML text with the Tidy library.
// The raw HTML is handed to Tidy without any preprocessing (the original author
// notes that Tidy was confirmed to cope with strings such as <DJ>).
//
// Parameters:
//   htmlString - NUL-terminated HTML input.
//   buffer     - caller-owned output buffer.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// Result, always NUL-terminated, written into buffer:
//   - the Tidy output on success;
//   - an empty string when Tidy produced no output;
//   - a message beginning with "ERROR:" on failure. When the output does not fit,
//     the message contains the required and the provided sizes.
// If buffer is null or bufferSize is not positive there is nowhere to write,
// so the function returns without touching anything.
//
// All Tidy calls are made while holding g_csTidy.
extern "C" __declspec(dllexport) void __cdecl TidyHtml(const char* htmlString, char* buffer, int bufferSize) {
    // An unusable buffer cannot even carry an error message.
    if (!buffer || bufferSize <= 0) {
        return;
    }
    if (!htmlString) {
        strncpy_s(buffer, bufferSize, "ERROR: TidyHtml: 无效的输入参数", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    EnterCriticalSection(&g_csTidy);

    TidyDoc tdoc = tidyCreate();
    if (!tdoc) {
        strncpy_s(buffer, bufferSize, "ERROR: TidyHtml: 无法创建 TidyDoc", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        LeaveCriticalSection(&g_csTidy);
        return;
    }

    // Configure Tidy: XML output, no doctype, no indentation.
    tidyOptSetBool(tdoc, TidyXmlOut, yes);
    tidyOptSetBool(tdoc, TidyXhtmlOut, no);
    tidyOptParseValue(tdoc, "doctype", "omit");
    tidyOptSetBool(tdoc, TidyMark, no);
    tidyOptSetBool(tdoc, TidyIndentContent, no);
    tidyOptSetBool(tdoc, TidyDropEmptyParas, yes);
    tidyOptSetBool(tdoc, TidyFixUri, yes);       // apply URI encoding where needed
    tidyOptSetBool(tdoc, TidyForceOutput, yes);  // still emit a document when errors were found (essential)
    tidyOptSetInt(tdoc, TidyMergeDivs, no);
    tidyOptSetInt(tdoc, TidyMergeSpans, no);
    tidyOptSetBool(tdoc, TidyLogicalEmphasis, yes);

    TidyBuffer output = {0};

    // Parse -> clean and repair -> serialize; stop at the first negative status.
    int err = tidyParseString(tdoc, htmlString);
    if (err >= 0) {
        err = tidyCleanAndRepair(tdoc);
    }
    if (err >= 0) {
        err = tidySaveBuffer(tdoc, &output);
    }

    if (err < 0) {
        strncpy_s(buffer, bufferSize, "ERROR: TidyHtml: 处理HTML时发生错误,请检查输入是否为有效的HTML片段。", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    } else if (output.bp) {
        if (output.size >= static_cast<unsigned int>(bufferSize)) {
            _snprintf_s(buffer, bufferSize, _TRUNCATE, "ERROR: TidyHtml: 处理结果过大,缓冲区不足(需要%d字节,提供%d字节)。", static_cast<int>(output.size), bufferSize);
        } else {
            // Copy at most output.size bytes so the read never runs past Tidy's
            // data, whether or not the Tidy buffer carries its own terminator.
            strncpy_s(buffer, bufferSize, reinterpret_cast<const char*>(output.bp), output.size);
            buffer[bufferSize - 1] = '\0';
        }
    } else {
        buffer[0] = '\0';
    }

    tidyBufFree(&output);
    tidyRelease(tdoc);
    LeaveCriticalSection(&g_csTidy);
}
// Exported: tidies an HTML string into XML via TidyHtml and parses the result
// into a new pugixml document.
//
// Parameters:
//   htmlString - NUL-terminated HTML input.
//
// Returns an opaque pointer to a heap-allocated pugi::xml_document on success;
// the code in this file releases such pointers with FreeXml. Returns nullptr
// when htmlString is null, when TidyHtml reports an error, when the tidied text
// does not fit into the 10 MB buffer limit, when pugixml fails to parse the
// tidied text, or when memory runs out. No document is leaked on any of these
// failure paths and no C++ exception leaves this function.
extern "C" __declspec(dllexport) void* __cdecl LoadHtml(const char* htmlString) {
    if (!htmlString) return nullptr;

    // Upper bound for the intermediate Tidy output buffer (10 MB).
    const size_t maxBufferSize = 10 * 1024 * 1024;

    // Initial guess: 5x the input plus a 1 KB safety margin, clamped to the limit
    // so that large inputs still get one attempt with the biggest allowed buffer.
    const size_t inputLen = strlen(htmlString);
    size_t dynamicBufferSize = maxBufferSize;
    if (inputLen <= (maxBufferSize - 1024) / 5) {
        dynamicBufferSize = inputLen * 5 + 1024;
    }

    pugi::xml_document* doc = nullptr;
    try {
        doc = new pugi::xml_document();

        for (;;) {
            std::vector<char> tidyBuffer(dynamicBufferSize);
            TidyHtml(htmlString, tidyBuffer.data(), static_cast<int>(tidyBuffer.size()));

            if (strncmp(tidyBuffer.data(), "ERROR:", 6) != 0) {
                // Tidy succeeded: parse its XML output.
                pugi::xml_parse_result result = doc->load_string(
                    tidyBuffer.data(),
                    pugi::parse_default | pugi::parse_ws_pcdata | pugi::parse_escapes
                );
                if (result) {
                    return doc;
                }
                break;  // parse failure: fall through to cleanup
            }

            // Only the "buffer too small" error is worth retrying, and only
            // while the buffer can still grow.
            const bool tooSmall = strstr(tidyBuffer.data(), "缓冲区不足") != nullptr;
            if (!tooSmall || dynamicBufferSize >= maxBufferSize) {
                break;
            }
            dynamicBufferSize *= 2;
            if (dynamicBufferSize > maxBufferSize) {
                dynamicBufferSize = maxBufferSize;
            }
        }
    } catch (const std::bad_alloc&) {
        // Out of memory: handled by the shared cleanup below.
    }

    delete doc;
    return nullptr;
}
// Exported: runs an XPath query and returns every match, one per line.
//
// Parameters:
//   docPtr     - document pointer as returned by LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   buffer     - caller-owned output buffer.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// Output written to buffer (NUL-terminated):
//   - matches joined by '\n' with no trailing newline. An attribute yields its
//     value; an element yields its serialized markup with line breaks removed,
//     runs of spaces collapsed to one space and a single space between two tags
//     dropped; a comment yields "<!--" + text + "-->"; any other node yields
//     its value. Empty values are skipped.
//   - a message beginning with "ERROR:" when docPtr or xpath is null or when
//     the query throws.
// A value that does not fit into the remaining space is skipped silently;
// later, shorter values may still be appended.
// If buffer is null or bufferSize is not positive the function returns at once.
extern "C" __declspec(dllexport) void __cdecl XPathSelect(void* docPtr, const char* xpath, char* buffer, int bufferSize) {
    // Without a writable buffer neither a result nor an error can be reported.
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }
    if (!xpath) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);
        pugi::xpath_node_set nodes = doc->select_nodes(xpath);

        int pos = 0;  // write offset into buffer
        for (pugi::xpath_node node : nodes) {
            std::string value;
            if (node.attribute()) {
                value = node.attribute().value();
            } else if (node.node()) {
                pugi::xml_node n = node.node();
                if (n.type() == pugi::node_element) {
                    std::ostringstream oss;
                    n.print(oss, "", pugi::format_raw, pugi::encoding_utf8);
                    value = oss.str();

                    // Remove line breaks.
                    value.erase(std::remove(value.begin(), value.end(), '\n'), value.end());
                    // Collapse each run of spaces to a single space. Removing
                    // every space would glue tag names to their attributes.
                    value.erase(std::unique(value.begin(), value.end(),
                                            [](char a, char b) { return a == ' ' && b == ' '; }),
                                value.end());
                    // Drop the single space left between adjacent tags.
                    std::string::size_type gap = 0;
                    while ((gap = value.find("> <", gap)) != std::string::npos) {
                        value.replace(gap, 3, "><");
                    }
                } else if (n.type() == pugi::node_comment) {
                    value = "<!--";
                    value += n.value();
                    value += "-->";
                } else {
                    // Text, CDATA and every remaining node type: raw value.
                    value = n.value();
                }
            }

            if (!value.empty()) {
                const int len = static_cast<int>(value.length());
                // Room is needed for the value, the separator and the terminator.
                if (pos + len + 2 < bufferSize) {
                    strcpy_s(buffer + pos, bufferSize - pos, value.c_str());
                    pos += len;
                    buffer[pos] = '\n';
                    pos++;
                    buffer[pos] = '\0';
                }
            }
        }

        // Strip the separator that follows the last value.
        if (pos > 0 && buffer[pos - 1] == '\n') {
            buffer[pos - 1] = '\0';
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath查询异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}
// Exported: runs an XPath query and returns only the first matching node.
//
// Parameters:
//   docPtr     - document pointer as returned by LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   buffer     - caller-owned output buffer.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// Output written to buffer (NUL-terminated, truncated to fit):
//   - for an attribute match, its value; for an element, its serialized markup
//     with line breaks removed, runs of spaces collapsed to one space and a
//     single space between two tags dropped; for any other node, its value;
//   - an empty string when nothing matches or the value is empty;
//   - a message beginning with "ERROR:" when docPtr or xpath is null or when
//     the query throws.
// If buffer is null or bufferSize is not positive the function returns at once.
extern "C" __declspec(dllexport) void __cdecl XPathSelectSingle(void* docPtr, const char* xpath, char* buffer, int bufferSize) {
    // Without a writable buffer neither a result nor an error can be reported.
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }
    if (!xpath) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);
        pugi::xpath_node node = doc->select_node(xpath);
        if (node) {
            std::string value;
            if (node.attribute()) {
                value = node.attribute().value();
            } else if (node.node()) {
                pugi::xml_node n = node.node();
                if (n.type() == pugi::node_element) {
                    std::ostringstream oss;
                    n.print(oss, "", pugi::format_raw, pugi::encoding_utf8);
                    value = oss.str();

                    // Remove line breaks.
                    value.erase(std::remove(value.begin(), value.end(), '\n'), value.end());
                    // Collapse each run of spaces to a single space. Removing
                    // every space would glue tag names to their attributes.
                    value.erase(std::unique(value.begin(), value.end(),
                                            [](char a, char b) { return a == ' ' && b == ' '; }),
                                value.end());
                    // Drop the single space left between adjacent tags.
                    std::string::size_type gap = 0;
                    while ((gap = value.find("> <", gap)) != std::string::npos) {
                        value.replace(gap, 3, "><");
                    }
                } else {
                    value = n.value();
                }
            }

            if (!value.empty()) {
                strncpy_s(buffer, bufferSize, value.c_str(), bufferSize - 1);
                buffer[bufferSize - 1] = '\0';
            }
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath查询异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}
// Exported: returns the text of the first node matched by an XPath expression.
//
// Parameters:
//   docPtr     - document pointer as returned by LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   buffer     - caller-owned output buffer.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// Output written to buffer (NUL-terminated, truncated to fit):
//   - the string returned by pugixml's xml_node::text().get() for the match;
//   - an empty string when nothing matches or the match is not a node
//     (for example an attribute);
//   - a message beginning with "ERROR:" when docPtr or xpath is null or when
//     the query throws.
// NOTE(review): per the pugixml manual, text() addresses the node's own
// PCDATA/CDATA child, so text nested in child elements is presumably not
// included — confirm this is the intended meaning of "text content".
// If buffer is null or bufferSize is not positive the function returns at once.
extern "C" __declspec(dllexport) void __cdecl GetTextContent(void* docPtr, const char* xpath, char* buffer, int bufferSize) {
    // Without a writable buffer neither a result nor an error can be reported.
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }
    if (!xpath) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);
        pugi::xpath_node node = doc->select_node(xpath);
        if (node && node.node()) {
            std::string text = node.node().text().get();
            strncpy_s(buffer, bufferSize, text.c_str(), bufferSize - 1);
            buffer[bufferSize - 1] = '\0';
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: 获取文本内容异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}
// Exported: returns an attribute value of the first node matched by an XPath
// expression.
//
// Parameters:
//   docPtr     - document pointer as returned by LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   attrName   - name of the attribute to read from the matched node.
//   buffer     - caller-owned output buffer.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// Output written to buffer (NUL-terminated, truncated to fit):
//   - the attribute value. When the XPath itself selects an attribute, that
//     attribute's value is returned and attrName is not consulted;
//   - an empty string when nothing matches or the attribute does not exist;
//   - a message beginning with "ERROR:" when docPtr, xpath or attrName is null
//     or when the query throws.
// If buffer is null or bufferSize is not positive the function returns at once.
extern "C" __declspec(dllexport) void __cdecl GetNodeAttr(void* docPtr, const char* xpath, const char* attrName, char* buffer, int bufferSize) {
    // Without a writable buffer neither a result nor an error can be reported.
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }
    if (!xpath || !attrName) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式或属性名为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);
        pugi::xpath_node node = doc->select_node(xpath);
        if (node) {
            pugi::xml_attribute attr;
            if (node.attribute()) {
                attr = node.attribute();
            } else if (node.node()) {
                attr = node.node().attribute(attrName);
            }
            if (attr) {
                strncpy_s(buffer, bufferSize, attr.value(), bufferSize - 1);
                buffer[bufferSize - 1] = '\0';
            }
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: 获取节点属性异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}
// Exported: returns the name of the first node or attribute matched by an XPath
// expression.
//
// Parameters:
//   docPtr     - document pointer as returned by LoadHtml / LoadXml.
//   xpath      - NUL-terminated XPath expression.
//   buffer     - caller-owned output buffer.
//   bufferSize - capacity of buffer in bytes, including the terminating NUL.
//
// Output written to buffer (NUL-terminated, truncated to fit):
//   - the attribute name when the match is an attribute, otherwise the node name;
//   - an empty string when nothing matches;
//   - a message beginning with "ERROR:" when docPtr or xpath is null or when
//     the query throws.
// If buffer is null or bufferSize is not positive the function returns at once.
extern "C" __declspec(dllexport) void __cdecl GetNodeName(void* docPtr, const char* xpath, char* buffer, int bufferSize) {
    // Without a writable buffer neither a result nor an error can be reported.
    if (!buffer || bufferSize <= 0) {
        return;
    }
    buffer[0] = '\0';

    if (!docPtr) {
        strncpy_s(buffer, bufferSize, "ERROR: 文档指针为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }
    if (!xpath) {
        strncpy_s(buffer, bufferSize, "ERROR: XPath表达式为空", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
        return;
    }

    try {
        pugi::xml_document* doc = static_cast<pugi::xml_document*>(docPtr);
        pugi::xpath_node node = doc->select_node(xpath);
        if (node) {
            const char* name = nullptr;
            if (node.attribute()) {
                name = node.attribute().name();
            } else if (node.node()) {
                name = node.node().name();
            }
            if (name) {
                strncpy_s(buffer, bufferSize, name, bufferSize - 1);
                buffer[bufferSize - 1] = '\0';
            }
        }
    }
    catch (const std::exception& e) {
        std::string errMsg = "ERROR: ";
        errMsg += e.what();
        strncpy_s(buffer, bufferSize, errMsg.c_str(), bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
    catch (...) {
        strncpy_s(buffer, bufferSize, "ERROR: 获取节点名称异常", bufferSize - 1);
        buffer[bufferSize - 1] = '\0';
    }
}
// Exported: destroys a document.
//
// Parameters:
//   docPtr - pointer that is treated as a pugi::xml_document* and deleted;
//            a null pointer is accepted and ignored.
extern "C" __declspec(dllexport) void __cdecl FreeXml(void* docPtr) {
    // Deleting a null pointer does nothing, so no explicit null test is needed.
    delete static_cast<pugi::xml_document*>(docPtr);
}
// Exported: parses an XML string into a new pugixml document.
//
// Parameters:
//   xmlString - NUL-terminated XML text.
//
// Returns an opaque pointer to a heap-allocated pugi::xml_document on success;
// the code in this file releases such pointers with FreeXml. Returns nullptr
// when xmlString is null, when parsing fails, or when memory runs out. No C++
// exception leaves this function.
extern "C" __declspec(dllexport) void* __cdecl LoadXml(const char* xmlString) {
    if (!xmlString) {
        return nullptr;
    }

    pugi::xml_document* doc = nullptr;
    try {
        // new reports failure by throwing, so the allocation sits inside the try.
        doc = new pugi::xml_document();
        pugi::xml_parse_result result = doc->load_string(xmlString,
            pugi::parse_default | pugi::parse_ws_pcdata | pugi::parse_escapes);
        if (result) {
            return doc;
        }
    } catch (const std::bad_alloc&) {
        // Out of memory: handled by the shared cleanup below.
    }

    delete doc;
    return nullptr;
}
*************/
// --- Build the final compile command ---
// NOTE(review): the two -DTIDY_MAX_* macros are not referenced by the wrapper source
// embedded in this script; whether the Tidy headers read them is not visible here — confirm.
var compileParams = 'g++ pugixml_html_wrapper.cpp pugixml.cpp -o pugixml_html.dll -shared -s -m32 -static -L"tidy-5.6.0-vc10-32b/lib" -I"tidy-5.6.0-vc10-32b/include" -ltidy -DTIDY_MAX_ATTRIBUTE_LENGTH=8192 -DTIDY_MAX_INPUT_BUFFER=1048576';
console.log("执行编译命令:");
console.log(compileParams);
console.log("");

// --- Remove any DLL left over from an earlier build ---
// The result check below only tests whether the DLL exists, so a stale file would make
// a failed compile look like a success.
if( io.exist("/pugixml_html.dll") ) {
	io.remove("/pugixml_html.dll");
	if( io.exist("/pugixml_html.dll") ) {
		// The old file could not be removed, so the result check would be meaningless.
		console.log("❌ 错误:无法删除旧的 pugixml_html.dll,请确认该文件未被其他程序占用。");
		console.pause();
		return;
	}
}

// --- Run the compiler ---
gcc.exec(compileParams);

// --- Check the result: the DLL exists only if this build produced it ---
if(io.exist("/pugixml_html.dll")) {
	console.log("\n✅ 编译成功: pugixml_html.dll");
	console.log("此 DLL 已集成 Pugixml 和 Tidy 功能,已修复线程安全和缓冲区问题。");
} else {
	console.log("\n❌ 编译失败,请检查上方的错误信息。");
}
console.pause();