记录:PDF关键字寻找
2019-06-05 本文已影响0人
小鸡在路上
下面是一个可以直接使用的 demo,用于获取 PDF 中关键字所在的位置(基于 Apache PDFBox):
package com.sign;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
/**
 * Locates every occurrence of a keyword inside a PDF held in memory.
 *
 * <p>The class extends {@link PDFTextStripper} and overrides
 * {@link #writeString(String, List)} so that, instead of emitting text, it
 * compares the glyphs handed to it against the keyword and records where a
 * match starts.
 *
 * <p>Limitations visible from the implementation:
 * <ul>
 *   <li>A keyword is only found when all of its characters arrive in the same
 *       {@code writeString} call; matches that span two calls are missed.</li>
 *   <li>Each keyword character is compared with the full
 *       {@link TextPosition#getUnicode()} string of one glyph, so a glyph that
 *       maps to several characters never matches.</li>
 *   <li>Instances keep per-search mutable state and must not be shared between
 *       threads.</li>
 * </ul>
 */
public class BoxKeyPosition extends PDFTextStripper {

    /** Keyword to search for, one element per character. */
    private char[] key;

    /** Raw bytes of the PDF document to search. */
    private byte[] src;

    /** Hits collected by {@link #writeString} for the page currently being processed. */
    private final List<float[]> pagelist = new ArrayList<float[]>();

    /**
     * Creates a searcher for the given keyword and document.
     *
     * @param keyWords keyword to look for; must not be {@code null}
     * @param src      bytes of the PDF document
     * @throws IOException if the underlying {@link PDFTextStripper} cannot be created
     */
    public BoxKeyPosition(String keyWords, byte[] src) throws IOException {
        super();
        super.setSortByPosition(true);
        this.src = src;
        this.key = keyWords.toCharArray();
    }

    /** @return the keyword characters currently searched for */
    public char[] getKey() {
        return key;
    }

    /** @param key the keyword characters to search for */
    public void setKey(char[] key) {
        this.key = key;
    }

    /** @return the bytes of the PDF document */
    public byte[] getSrc() {
        return src;
    }

    /** @param src the bytes of the PDF document to search */
    public void setSrc(byte[] src) {
        this.src = src;
    }

    /**
     * Scans the document page by page and returns one entry per keyword hit.
     *
     * <p>Each entry is a {@code float[3]}:
     * <ul>
     *   <li>{@code [0]} - X of the first matched glyph plus
     *       {@code key.length * width / 2}, where {@code width} is that glyph's width;</li>
     *   <li>{@code [1]} - Y of the first matched glyph minus its height;</li>
     *   <li>{@code [2]} - 1-based page number of the hit.</li>
     * </ul>
     *
     * <p>Every call starts from an empty result, so calling this method again
     * (for example after {@link #setKey(char[])}) does not return hits from a
     * previous search.
     *
     * @return a new list with all hits in page order; empty when nothing matches
     * @throws IOException if the document cannot be loaded or parsed
     */
    public List<float[]> getPosition() throws IOException {
        List<float[]> result = new ArrayList<float[]>();
        // Use a local handle so that only the document opened by this call is closed.
        PDDocument doc = PDDocument.load(src);
        try {
            super.setSortByPosition(true);
            int pages = doc.getNumberOfPages();
            for (int page = 1; page <= pages; page++) {
                pagelist.clear();
                // Restrict extraction to one page so every hit can be tagged with it.
                super.setStartPage(page);
                super.setEndPage(page);
                // The extracted text itself is not needed; it is written to a throw-away buffer.
                Writer sink = new OutputStreamWriter(new ByteArrayOutputStream());
                super.writeText(doc, sink);
                for (float[] hit : pagelist) {
                    hit[2] = page;
                }
                result.addAll(pagelist);
            }
            return result;
        } finally {
            pagelist.clear();
            doc.close();
        }
    }

    /**
     * Receives a run of glyphs from the stripper and records every position in
     * it where the keyword starts.
     *
     * @param string        the text of the run (unused)
     * @param textPositions the glyphs making up the run
     * @throws IOException declared by the overridden method; not thrown here
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        if (key == null || key.length == 0) {
            return; // nothing to search for
        }
        // Last index at which the whole keyword still fits into this run.
        int lastStart = textPositions.size() - key.length;
        for (int start = 0; start <= lastStart; start++) {
            if (!matchesAt(textPositions, start)) {
                continue;
            }
            TextPosition first = textPositions.get(start);
            float[] idx = new float[3];
            idx[0] = first.getX() + key.length * first.getWidth() / 2;
            idx[1] = first.getY() - first.getHeight();
            // idx[2] (page number) is filled in by getPosition().
            pagelist.add(idx);
        }
    }

    /** Returns true when the keyword matches the glyphs beginning at {@code start}. */
    private boolean matchesAt(List<TextPosition> textPositions, int start) {
        for (int j = 0; j < key.length; j++) {
            String glyph = textPositions.get(start + j).getUnicode();
            if (!String.valueOf(key[j]).equals(glyph)) {
                return false;
            }
        }
        return true;
    }
}
package com.sign;
import java.io.*;
import java.util.List;
/**
 * Command-line demo for {@code BoxKeyPosition}: reads a PDF file into memory,
 * searches it for a keyword and prints every hit.
 *
 * <p>Usage: {@code SignPostionTest [pdfPath [keyword]]}. When an argument is
 * omitted the built-in default is used.
 *
 * @author MG01857
 * @version 1.0 (2018/12/15)
 */
public class SignPostionTest {

    /** PDF searched when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH =
            "C:\\Users\\MG01857\\Desktop\\pdf生成浏览\\世联信贷征信查询授权书_word转PDF_黄智炜.pdf";

    /** Keyword searched when none is given on the command line. */
    private static final String DEFAULT_KEYWORD = "borrower";

    /**
     * Reads the stream until end-of-file and returns everything that was read.
     *
     * @param in stream to drain; it is not closed by this method
     * @return all bytes read from {@code in}
     * @throws IOException if reading fails
     */
    private static byte[] toByteArray(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024 * 4];
        int n;
        while ((n = in.read(buffer)) != -1) {
            out.write(buffer, 0, n);
        }
        return out.toByteArray();
    }

    /**
     * Runs the keyword search and prints one line per hit.
     *
     * @param args optional {@code [pdfPath, keyword]}
     * @throws Exception if the file cannot be read or the PDF cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        String keyword = (args != null && args.length > 1) ? args[1] : DEFAULT_KEYWORD;

        byte[] data;
        InputStream in = new FileInputStream(filePath);
        try {
            data = toByteArray(in);
        } finally {
            // Close the file even when reading fails.
            in.close();
        }

        BoxKeyPosition boxKeyPosition = new BoxKeyPosition(keyword, data);
        List<float[]> position = boxKeyPosition.getPosition();
        for (float[] f : position) {
            // Print the array contents; float[].toString() only yields an identity string.
            System.out.println(java.util.Arrays.toString(f));
        }
    }
}