安卓基础篇之SAX解析XML文件

2023-02-08  本文已影响0人  小城哇哇

SAX解析XML文件

基本使用方法

import java.io.IOException;
import java.util.ArrayList;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import com.garlick.xml.decode.Decode;

/**
 * SAX-based decoder that reads the company XML file and prints its groups,
 * leaders and staff members to standard output.
 *
 * <p>The file name and the element tag names are constants referenced from the
 * enclosing hierarchy (presumably declared in {@code Decode} — not visible here).
 */
public class SaxXmlDecode extends Decode {
    /**
     * Parses the file named by {@code COMPANY_FILE_NAME} with a SAX parser.
     *
     * <p>The result is printed by the handler when the end of the document is
     * reached. Parsing failures are not rethrown; their stack trace is printed.
     */
    public void decode() {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        try {
            factory.newSAXParser().parse(COMPANY_FILE_NAME, new MyHandler());
        } catch (IOException | SAXException | ParserConfigurationException e) {
            // Best-effort demo behaviour kept from the original: report and continue.
            e.printStackTrace();
        }
    }

    /**
     * SAX callback handler that collects one {@link Group} per group element.
     *
     * <p>Leader names come from the {@code name} attribute of leader elements;
     * staff names come from the text content of staff elements.
     */
    private class MyHandler extends DefaultHandler {
        /** All groups completed so far; created in {@link #startDocument()}. */
        private ArrayList<Group> groups;
        /** The group currently being parsed, or null when outside a group element. */
        private Group group;
        /** True while the parser is inside a staff element. */
        private boolean staff = false;
        /** Accumulates the text of the current staff element across callbacks. */
        private final StringBuilder staffText = new StringBuilder();

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            super.characters(ch, start, length);
            // A parser may deliver one text node in several chunks, so the text is
            // only buffered here and committed as a single entry in endElement.
            if (staff) {
                staffText.append(ch, start, length);
            }
        }

        @Override
        public void endDocument() throws SAXException {
            super.endDocument();
            print(groups);
        }

        @Override
        public void startDocument() throws SAXException {
            super.startDocument();
            groups = new ArrayList<Group>();
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
            super.startElement(uri, localName, qName, attributes);
            if (GROUP_ELEMENT_TAG_NAME.equals(qName)) {
                group = new Group();
            } else if (LEADER_ELEMENT_TAG_NAME.equals(qName)) {
                if (group != null) {
                    if (group.leaders == null) {
                        group.leaders = new ArrayList<String>();
                    }
                    String leaderName = attributes.getValue("name");
                    if (leaderName != null) {
                        group.leaders.add(leaderName);
                    }
                }
            } else if (STAFF_ELEMENT_TAG_NAME.equals(qName)) {
                if (group != null && group.staffs == null) {
                    group.staffs = new ArrayList<String>();
                }
                // Start with an empty buffer for this staff element.
                staffText.setLength(0);
                staff = true;
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            super.endElement(uri, localName, qName);
            if (GROUP_ELEMENT_TAG_NAME.equals(qName)) {
                if (group != null) {
                    groups.add(group);
                }
                // Leaving the group: later stray elements must not attach to it.
                group = null;
            } else if (STAFF_ELEMENT_TAG_NAME.equals(qName)) {
                // Commit the buffered text as one entry; an element without text
                // adds nothing, as before.
                if (group != null && group.staffs != null && staffText.length() > 0) {
                    group.staffs.add(staffText.toString());
                }
                staffText.setLength(0);
                staff = false;
            }
        }
    }

    /** Plain holder for the leaders and staff of one group; lists are created lazily. */
    private static class Group {
        ArrayList<String> leaders;
        ArrayList<String> staffs;
    }

    /**
     * Prints the company tag name followed by every group with its leaders and staff.
     *
     * @param groups the parsed groups; nothing is printed when null or empty
     */
    private void print(ArrayList<Group> groups) {
        if (groups == null || groups.isEmpty()) {
            return;
        }
        System.out.println(COMPANY_ELEMENT_TAG_NAME);
        for (int index = 0; index < groups.size(); index++) {
            // Groups are numbered starting from 1 in the output.
            System.out.println("\t" + GROUP_ELEMENT_TAG_NAME + " " + (index + 1));
            Group current = groups.get(index);
            if (current.leaders != null) {
                for (String leader : current.leaders) {
                    System.out.println("\t\t" + LEADER_ELEMENT_TAG_NAME + ":\t" + leader);
                }
            }
            if (current.staffs != null) {
                for (String member : current.staffs) {
                    System.out.println("\t\t" + STAFF_ELEMENT_TAG_NAME + ":\t" + member);
                }
            }
        }
    }
}

详细源码解析

SAXParserImpl对象的初始化

在使用SAX解析XML文件的过程中,首先,先通过其newInstance函数初始化一个SAXParserFactory对象,

/**
 * Returns a new factory instance.
 *
 * @return a newly constructed {@code SAXParserFactoryImpl}
 */
public static SAXParserFactory newInstance() {
    // instantiate the class directly rather than using reflection
    // Create a SAXParserFactoryImpl object
    return new SAXParserFactoryImpl();
} 

即直接new一个SAXParserFactory的子类SAXParserFactoryImpl对象。然后,调用其newSAXParser函数:

/**
 * Creates a new SAX parser configured with this factory's {@code features}.
 * (Excerpt: the leading checks and the catch clauses are elided.)
 */
@Override
public SAXParser newSAXParser() throws ParserConfigurationException {
    // ...... condition checks omitted (the article notes the branch is never entered)
    try {
        return new SAXParserImpl(features);
    } 
    // ...... catch clauses omitted from this excerpt
} 

也就是说,在这边直接初始化一个SAXParserImpl对象

/**
 * Stores the initial features and builds the underlying reader.
 *
 * @param initialFeatures feature name to enabled flag; copied into a new
 *        HashMap unless empty, in which case an empty map is used
 */
SAXParserImpl(Map<String, Boolean> initialFeatures)
        throws SAXNotRecognizedException, SAXNotSupportedException {
    this.initialFeatures = initialFeatures.isEmpty()
            ? Collections.<String, Boolean>emptyMap()
            : new HashMap<String, Boolean>(initialFeatures);
    resetInternal();
}

/**
 * Replaces {@code reader} with a fresh ExpatReader and applies every entry of
 * {@code initialFeatures} to it.
 */
private void resetInternal()
        throws SAXNotSupportedException, SAXNotRecognizedException {
    reader = new ExpatReader();
    for (Map.Entry<String,Boolean> entry : initialFeatures.entrySet()) {
        reader.setFeature(entry.getKey(), entry.getValue());
    }
} 

解析XML文件

/**
 * Wraps {@code uri} in an InputSource and delegates to
 * {@code parse(InputSource, DefaultHandler)}.
 *
 * @param uri location of the XML content
 * @param dh handler passed through to the overload
 */
public void parse(String uri, DefaultHandler dh)
    throws SAXException, IOException {
    // ...... null-check code omitted
    InputSource input = new InputSource(uri);
    this.parse(input, dh);
} 

初始化InputSource对象,然后将其作为参数,调用重载函数parse

/**
 * Parses {@code is} with this parser's XMLReader.
 *
 * @param is the input to parse
 * @param dh when non-null, installed on the reader as content handler, entity
 *        resolver, error handler and DTD handler
 */
public void parse(InputSource is, DefaultHandler dh)
    throws SAXException, IOException {
    // ...... null-check code omitted
    // Obtain the XMLReader object
    XMLReader reader = this.getXMLReader();
    if (dh != null) {
        reader.setContentHandler(dh);
        reader.setEntityResolver(dh);
        reader.setErrorHandler(dh);
        reader.setDTDHandler(dh);
    }
    reader.parse(is);
} 

reader为刚刚在SAXParserImpl初始化过程中,初始化的一个ExpatReader对象,因此直接调用ExpatReader的parse函数

/**
 * Parses {@code input}, trying its sources in order: character stream, then
 * byte stream, then a stream opened from the system id.
 * (Excerpt: null checks and catch/finally clauses are partly elided.)
 */
public void parse(InputSource input) throws IOException, SAXException {
    // ...... null-check code omitted
    Reader reader = input.getCharacterStream();
    if (reader != null) {
        try {
            parse(reader, input.getPublicId(), input.getSystemId());
        }
        // ......
        return;
    }

    // Try the byte stream.
    InputStream in = input.getByteStream();
    String encoding = input.getEncoding();
    // null (article's note — presumably because the InputSource traced here was built from a URI only)
    if (in != null) {
        try {
            parse(in, encoding, input.getPublicId(), input.getSystemId());
        }
        // ......
        return;
    }

    String systemId = input.getSystemId();
    // ......
    // Try the system id.
    // Per the article: creates a URLConnection, then calls the overloaded parse
    in = ExpatParser.openUrl(systemId);
    try {
        parse(in, encoding, input.getPublicId(), systemId);
    } finally {
        IoUtils.closeQuietly(in);
    }
} 

从上述代码看,这边创建了一个URLConnection,然后调用重载parse函数

/**
 * Builds an ExpatParser for the given encoding and ids and has it parse the
 * whole stream as a document.
 */
private void parse(InputStream in, String charsetName, String publicId, String systemId)
        throws IOException, SAXException {
    // Create the ExpatParser object
    ExpatParser parser = new ExpatParser(charsetName, this, processNamespaces, publicId, systemId);
    parser.parseDocument(in);
} 

初始化一个ExpatParser对象,然后调用其parseDocument函数 a. 初始化ExpatParser对象 这边使用了new直接初始化ExpatParser

/**
 * Records the encoding (DEFAULT_ENCODING when {@code encoding} is null) and
 * stores the value returned by the native {@code initialize} call in
 * {@code pointer}. (Excerpt: other field assignments are elided.)
 */
/*package*/ ExpatParser(String encoding, ExpatReader xmlReader,
        boolean processNamespaces, String publicId, String systemId) {
    // ......
    this.encoding = encoding == null ? DEFAULT_ENCODING : encoding;
    this.pointer = initialize(
        this.encoding,
        processNamespaces
    );
}
// Native initialize function; its returned long is kept in the pointer field
private native long initialize(String encoding, boolean namespacesEnabled); 

这边调用了native方法initialize函数(org_apache_harmony_xml_ExpatParser.cpp)

// JNI entry point for ExpatParser.initialize: allocates a ParsingContext,
// creates an expat XML_Parser (namespace-aware or not, depending on
// processNamespaces) and returns it converted to a jlong.
// (Excerpt: error handling and handler registration are elided.)
static jlong ExpatParser_initialize(JNIEnv* env, jobject object, jstring javaEncoding,
        jboolean processNamespaces) {
    // Allocate parsing context.
    std::unique_ptr<ParsingContext> context(new ParsingContext(object));
    // ......
    context->processNamespaces = processNamespaces;

    // Create a parser.
    XML_Parser parser;
    ScopedUtfChars encoding(env, javaEncoding);
    // ......
    if (processNamespaces) {
        // Use '|' to separate URIs from local names.
        parser = XML_ParserCreateNS(encoding.c_str(), '|');
    } else {
        parser = XML_ParserCreate(encoding.c_str());
    }

    // ...... default data setup omitted
    return fromXMLParser(parser);
} 

上面省略的部分主要是设置一些默认的处理函数(相关实现位于external/expat/)。解析器本身通过XML_ParserCreateNS函数创建,这个函数位于external/expat下的xmlparse.c文件中

/* Creates a parser that uses nsSep as the namespace separator by forwarding
   to XML_ParserCreate_MM with no custom memory suite (NULL). */
XML_Parser XMLCALL
XML_ParserCreateNS(const XML_Char *encodingName, XML_Char nsSep)
{
  XML_Char tmp[2];
  /* Only the first element is set here; the separator is passed on by pointer. */
  *tmp = nsSep;
  return XML_ParserCreate_MM(encodingName, NULL, tmp);
} 

即,调用XML_ParserCreate_MM函数

/* Forwards to parserCreate, passing NULL for the DTD argument. */
XML_Parser XMLCALL
XML_ParserCreate_MM(const XML_Char *encodingName,
                    const XML_Memory_Handling_Suite *memsuite,
                    const XML_Char *nameSep)
{
  return parserCreate(encodingName, memsuite, nameSep, NULL);
}
// parserCreate is called next: it allocates the XML_ParserStruct and wires the
// C library allocator functions into the parser's memory suite.
// (Excerpt: the memsuite branch and default-value setup are elided.)
static XML_Parser
parserCreate(const XML_Char *encodingName,
             const XML_Memory_Handling_Suite *memsuite,
             const XML_Char *nameSep,
             DTD *dtd)
{
    XML_Parser parser;
    // ......
    {
        XML_Memory_Handling_Suite *mtemp;
        // Allocate memory for the XML parser
        parser = (XML_Parser)malloc(sizeof(struct XML_ParserStruct));
        if (parser != NULL) {
          mtemp = (XML_Memory_Handling_Suite *)&(parser->m_mem);
          mtemp->malloc_fcn = malloc;
          mtemp->realloc_fcn = realloc;
          mtemp->free_fcn = free;
        }
    }
    // ...... initialisation of default parameters omitted
    return parser;
} 

这边主要初始化parser,并且初始化一些参数,最后会调用parserInit函数进行初始化

/* Initialises the parser's default state; the body is elided in this excerpt. */
static void
parserInit(XML_Parser parser, const XML_Char *encodingName) {
    // ...... initialisation of default parameters omitted
} 

如上,直接初始化XML_Parser对象,并为其初始化一些默认值 在初始化完成后,调用XML_SetNamespaceDeclHandler等函数设置其一些初始化值,此处不做分析,有兴趣可以自行分析 至此,XML_Parser解析器初始化完成

b. 此后,将处理解析文档 这边主要是调用了ExpatParser的parseDocument函数

/**
 * Parses a whole document from {@code in}: calls startDocument, parses the
 * stream contents, calls finish, then calls endDocument.
 */
/*package*/ void parseDocument(InputStream in) throws IOException,
        SAXException {
    startDocument();
    parseFragment(in);
    finish();
    endDocument();
} 

即,调用了四个函数,最先和最后使用startDocument和endDocument,这个最后会调用到传入的DefaultHandler的startDocument和endDocument的函数 那么接下来看解析XML文件的主要内容函数parseFragment

/**
 * Reads {@code in} in chunks of at most BUFFER_SIZE bytes until end of stream
 * and passes each chunk to the native parser via appendBytes.
 * (Excerpt: the catch clause is elided.)
 */
private void parseFragment(InputStream in)
        throws IOException, SAXException {
    byte[] buffer = new byte[BUFFER_SIZE];
    int length;
    // read() returns -1 at end of stream; length is the byte count of this chunk
    while ((length = in.read(buffer)) != -1) {
        try {
            appendBytes(this.pointer, buffer, 0, length);
        }
        // ...... exception handling omitted
    }
}

/** Native method that receives {@code length} bytes of {@code xml} starting at {@code offset}. */
private native void appendBytes(long pointer, byte[] xml, int offset,
        int length) throws SAXException, ExpatException; 

从这边可以看到,这边是顺序地将xml文档的内容分块读取到内存(每次最多BUFFER_SIZE字节),每读取一块就通过native函数appendBytes进行解析,其native实现如下

// JNI entry point for ExpatParser.appendBytes: obtains read-only access to the
// Java byte array and forwards it to append() as a non-final chunk (XML_FALSE).
static void ExpatParser_appendBytes(JNIEnv* env, jobject object, jlong pointer,
        jbyteArray xml, jint byteOffset, jint byteCount) {
    ScopedByteArrayRO byteArray(env, xml);
    // ......
    const char* bytes = reinterpret_cast<const char*>(byteArray.get());
    append(env, object, pointer, bytes, byteOffset, byteCount, XML_FALSE);
}

// Feeds byteCount bytes starting at bytes + byteOffset to XML_Parse.
// The parsing context's env/object are set for the duration of the call and
// cleared afterwards. (Excerpt: the failure handling body is elided.)
static void append(JNIEnv* env, jobject object, jlong pointer,
        const char* bytes, size_t byteOffset, size_t byteCount, jboolean isFinal) {
    XML_Parser parser = toXMLParser(pointer);
    ParsingContext* context = toParsingContext(parser);
    context->env = env;
    context->object = object;
    // Entered only when XML_Parse fails and no Java exception is already pending
    if (!XML_Parse(parser, bytes + byteOffset, byteCount, isFinal) && !env->ExceptionCheck()) {
        // ......
    }
    context->object = NULL;
    context->env = NULL;
} 

通过XML_Parse函数解析

/* Validates the arguments and parser state, copies len bytes from s into a
   buffer obtained from XML_GetBuffer, then parses it with XML_ParseBuffer.
   Returns XML_STATUS_ERROR on invalid arguments, a suspended or finished
   parser, or when no buffer can be obtained.
   (Excerpt: code between the state switch and the copy is elided.) */
enum XML_Status XMLCALL
XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
    if ((parser == NULL) || (len < 0) || ((s == NULL) && (len != 0))) {
        if (parser != NULL)
            parser->m_errorCode = XML_ERROR_INVALID_ARGUMENT;
        return XML_STATUS_ERROR;
    }
    switch (parser->m_parsingStatus.parsing) {
        case XML_SUSPENDED:
            parser->m_errorCode = XML_ERROR_SUSPENDED;
            return XML_STATUS_ERROR;
        case XML_FINISHED:
            parser->m_errorCode = XML_ERROR_FINISHED;
            return XML_STATUS_ERROR;
        // Per the article, the parser status is initialised to this value
        case XML_INITIALIZED:
            if (parser->m_parentParser == NULL && !startParsing(parser)) {
                parser->m_errorCode = XML_ERROR_NO_MEMORY;
                return XML_STATUS_ERROR;
            }
        /* fall through */
        default:
            // Start parsing
            parser->m_parsingStatus.parsing = XML_PARSING;
    }
    // ......
    {
        void *buff = XML_GetBuffer(parser, len);
        if (buff == NULL)
            return XML_STATUS_ERROR;
        else {
            memcpy(buff, s, len);
            // Parse the buffer
            return XML_ParseBuffer(parser, len, isFinal);
        }
    }
} 

最后调用XML_ParseBuffer函数进行数据解析

/* Parses the len bytes most recently added to the parser's buffer by running
   the parser's current m_processor over them.
   Returns XML_STATUS_ERROR when the parser is NULL, suspended, finished, fails
   to start, or the processor reports an error; otherwise returns result. */
enum XML_Status XMLCALL
XML_ParseBuffer(XML_Parser parser, int len, int isFinal) {
  const char *start;
  enum XML_Status result = XML_STATUS_OK;

  if (parser == NULL)
      return XML_STATUS_ERROR;
  switch (parser->m_parsingStatus.parsing) {
      case XML_SUSPENDED:
          parser->m_errorCode = XML_ERROR_SUSPENDED;
          return XML_STATUS_ERROR;
      case XML_FINISHED:
          parser->m_errorCode = XML_ERROR_FINISHED;
          return XML_STATUS_ERROR;
      case XML_INITIALIZED:
          if (parser->m_parentParser == NULL && !startParsing(parser)) {
              parser->m_errorCode = XML_ERROR_NO_MEMORY;
              return XML_STATUS_ERROR;
          }
      /* fall through */
      default:
          parser->m_parsingStatus.parsing = XML_PARSING;
  }
  // Set up the buffer and position pointers for this chunk
  start = parser->m_bufferPtr;
  parser->m_positionPtr = start;
  parser->m_bufferEnd += len;
  parser->m_parseEndPtr = parser->m_bufferEnd;
  parser->m_parseEndByteIndex += len;
  parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
  // Call m_processor to parse the XML data; per the article it is set to
  // prologInitProcessor when the XML_Parser object is initialised
  parser->m_errorCode = parser->m_processor(parser, start, parser->m_parseEndPtr, &parser->m_bufferPtr);

  if (parser->m_errorCode != XML_ERROR_NONE) {
      // On error, switch to errorProcessor for subsequent processor calls
      parser->m_eventEndPtr = parser->m_eventPtr;
      parser->m_processor = errorProcessor;
      return XML_STATUS_ERROR;
  } else {
      switch (parser->m_parsingStatus.parsing) {
          case XML_SUSPENDED:
              result = XML_STATUS_SUSPENDED;
              break;
          case XML_INITIALIZED:
          case XML_PARSING:
              // The final chunk marks the parser finished and returns at once
              if (isFinal) {
                  parser->m_parsingStatus.parsing = XML_FINISHED;
                  return result;
              }
          default: ;  /* should not happen */
      }
  }

  XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr, parser->m_bufferPtr, &parser->m_position);
  parser->m_positionPtr = parser->m_bufferPtr;
  return result;
} 

调用prologInitProcessor函数,解析数据 此后,通过while循环查找对应的数据,读取到内存,重复此阶段,完成所有数据解析

SAX解析XML总结

使用方法

1) 通过SAXParserFactory的newInstance函数创建一个SAXParserFactory对象,再通过其newSAXParser函数,初始化一个SAXParserImpl对象,然后调用其parse函数,将xml文件名和初始化的继承自DefaultHandler类的对象一起作为其参数

2) 在继承自DefaultHandler类的对象中,重写startDocument/endDocument/startElement/endElement/characters函数,然后一步步解析该xml文件即可 

源码分析

1) 通过SAXParserFactory的newInstance函数创建一个SAXParserFactoryImpl对象,然后通过其newSAXParser函数创建一个SAXParserImpl对象

2) 在创建SAXParserImpl对象的时候,初始化一个ExpatReader对象

3) 调用SAXParserImpl对象的parse函数时候,将xml文件名和初始化的继承自DefaultHandler类的对象作为参数

4) 在parse函数调用的时候,直接调用ExpatReader对象的parse函数,在该parse函数调用的过程中,初始化一个ExpatParser对象,并且调用其parseDocument函数

5) 在初始化ExpatParser对象的时候,调用native方法initialize(对应JNI层的ExpatParser_initialize函数),在该函数中通过libexpat库的XML_ParserCreateNS/XML_ParserCreate函数初始化native层的XML_Parser解析器

6) 在调用其parseDocument的时候,将xml文件按缓冲区大小(BUFFER_SIZE)分块读取,每读取一块就通过appendBytes交给native层解析,如此循环直到文件读完 

SAX解析XML的优缺点

  1. 由于SAX解析过程中,在native层进行解析,因此解析速度比较快
  2. 由于SAX解析xml的过程中,是读取部分数据进行解析,因此使用内存相对较少,而且较固定
  3. 由于SAX解析xml的过程中,需要继承DefaultHandler并重写其部分回调函数,在回调中一步步自行处理解析结果,因此需要事先对xml文件的结构和内容有相应的了解

扩展

SAXParserFactory提供了两个重载的newInstance函数:无参版本使用默认实现;带参数的版本newInstance(String factoryClassName, ClassLoader classLoader)可以指定工厂类,因此可以通过它客制化自己的解析器。

上一篇 下一篇

猜你喜欢

热点阅读