通过改造源码 Linux解决pdf发票转图片中文乱码
2023-01-12 本文已影响0人
小胖学编程
使用pdfbox将pdf转化为图片时,因为Linux服务器上缺乏足够的字体,会导致pdf发票转化出来的图片存在中文乱码。
解决这个问题的办法:
- linux中下载对应的字体,但是对于多台服务器的场景或者docker容器的场景,需要运维逐台(或在镜像中)安装;
- 修改源码,默认使用打包在jar包(classpath)中的字体。
采用的是第二种方式来增加对应的字体信息:
1. 引入依赖:
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.21</version>
</dependency>
2. 修改源码
本次方案是直接在业务工程中进行修改的:在项目里创建与pdfbox中同包名、同类名的类,依赖的是类加载时classpath的先后顺序——项目自身的类排在依赖jar之前,因此相同全限定名的类会优先读取项目中的。不过这种方式不太推荐,推荐将pdfbox
源码拉取下来,修改后发布到私服仓库。
源码见附录。
其实本质就是修改了两行代码,不同版本的pdfbox可以选择性的修改源码。
image.png
字体文件(STSONG-light.ttf)存放到resources目录下的ttf子目录中,与源码中的路径 ttf/STSONG-light.ttf 保持一致:下载地址
3. pdf转图片
/**
 * Renders the first page (index 0) of a PDF file to an RGB image.
 *
 * @param file the PDF file to render
 * @return the rendered first page
 * @throws IOException if the file cannot be loaded or the page cannot be rendered
 */
public static BufferedImage transferPdfToImage(File file) throws IOException {
    // try-with-resources closes the document even when rendering throws
    try (PDDocument document = PDDocument.load(file)) {
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        // the DPI (110 here) can be tuned: the larger the value, the sharper the image
        return pdfRenderer.renderImageWithDPI(0, 110, ImageType.RGB);
    }
}
图片写入二进制流:
// render the PDF with the transferPdfToImage method defined in step 3
BufferedImage bufferedImag = transferPdfToImage(file);
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
// encode as PNG into the in-memory stream; outputStream.toByteArray() yields the bytes
ImageIO.write(bufferedImag, "png", outputStream);
图片写入本地文件:
// render the PDF with the transferPdfToImage method defined in step 3
BufferedImage bufferedImag = transferPdfToImage(file);
// example target path; adjust to the local environment
Path dist = Paths.get("/Users/xxx/Downloads", "xxy2.png");
ImageIO.write(bufferedImag, "png", dist.toFile());
4. 附录
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// CHECKSTYLE:OFF
package org.apache.pdfbox.pdmodel.font;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.fontbox.FontBoxFont;
import org.apache.fontbox.ttf.OpenTypeFont;
import org.apache.fontbox.ttf.TTFParser;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.fontbox.type1.Type1Font;
/**
 * Font mapper, locates non-embedded fonts via a pluggable FontProvider.
 *
 * <p>This is a locally patched copy of the PDFBox class. Compared with the stock version it
 * registers a substitute list for "STSong-Light" and loads the last-resort font from the
 * classpath resource "ttf/STSONG-light.ttf", so that CJK text still renders on hosts that
 * have no suitable system fonts installed.
 *
 * @author John Hewson
 */
final class FontMapperImpl implements FontMapper
{
    private static final Log LOG = LogFactory.getLog(FontMapperImpl.class);
    private static final FontCache fontCache = new FontCache(); // todo: static cache isn't ideal

    private FontProvider fontProvider;
    /** Fonts reported by the provider, keyed by PostScript name (with and without hyphens). */
    private Map<String, FontInfo> fontInfoByName;
    /** Font returned when neither a name match nor any substitute could be found. */
    private final TrueTypeFont lastResortFont;

    /** Map of PostScript name substitutes, in priority order. */
    private final Map<String, List<String>> substitutes = new HashMap<String, List<String>>();

    /**
     * Registers the built-in substitute lists and loads the last-resort font.
     *
     * @throws RuntimeException if the last-resort font resource is missing or cannot be parsed
     */
    FontMapperImpl()
    {
        // substitutes for standard 14 fonts
        substitutes.put("Courier",
                Arrays.asList("CourierNew", "CourierNewPSMT", "LiberationMono", "NimbusMonL-Regu"));
        substitutes.put("Courier-Bold",
                Arrays.asList("CourierNewPS-BoldMT", "CourierNew-Bold", "LiberationMono-Bold",
                        "NimbusMonL-Bold"));
        substitutes.put("Courier-Oblique",
                Arrays.asList("CourierNewPS-ItalicMT","CourierNew-Italic",
                        "LiberationMono-Italic", "NimbusMonL-ReguObli"));
        substitutes.put("Courier-BoldOblique",
                Arrays.asList("CourierNewPS-BoldItalicMT","CourierNew-BoldItalic",
                        "LiberationMono-BoldItalic", "NimbusMonL-BoldObli"));
        substitutes.put("Helvetica",
                Arrays.asList("ArialMT", "Arial", "LiberationSans", "NimbusSanL-Regu"));
        substitutes.put("Helvetica-Bold",
                Arrays.asList("Arial-BoldMT", "Arial-Bold", "LiberationSans-Bold",
                        "NimbusSanL-Bold"));
        substitutes.put("Helvetica-Oblique",
                Arrays.asList("Arial-ItalicMT", "Arial-Italic", "Helvetica-Italic",
                        "LiberationSans-Italic", "NimbusSanL-ReguItal"));
        substitutes.put("Helvetica-BoldOblique",
                Arrays.asList("Arial-BoldItalicMT", "Helvetica-BoldItalic",
                        "LiberationSans-BoldItalic", "NimbusSanL-BoldItal"));
        substitutes.put("Times-Roman",
                Arrays.asList("TimesNewRomanPSMT", "TimesNewRoman", "TimesNewRomanPS",
                        "LiberationSerif", "NimbusRomNo9L-Regu"));
        substitutes.put("Times-Bold",
                Arrays.asList("TimesNewRomanPS-BoldMT", "TimesNewRomanPS-Bold",
                        "TimesNewRoman-Bold", "LiberationSerif-Bold",
                        "NimbusRomNo9L-Medi"));
        substitutes.put("Times-Italic",
                Arrays.asList("TimesNewRomanPS-ItalicMT", "TimesNewRomanPS-Italic",
                        "TimesNewRoman-Italic", "LiberationSerif-Italic",
                        "NimbusRomNo9L-ReguItal"));
        substitutes.put("Times-BoldItalic",
                Arrays.asList("TimesNewRomanPS-BoldItalicMT", "TimesNewRomanPS-BoldItalic",
                        "TimesNewRoman-BoldItalic", "LiberationSerif-BoldItalic",
                        "NimbusRomNo9L-MediItal"));
        substitutes.put("Symbol", Arrays.asList("Symbol", "SymbolMT", "StandardSymL"));
        substitutes.put("ZapfDingbats", Arrays.asList("ZapfDingbatsITC", "Dingbats", "MS-Gothic"));
        // local addition (not in stock PDFBox): substitutes for the STSong-Light CJK font
        substitutes.put("STSong-Light", Arrays.asList("STSONG-light","DengXian"));

        // Acrobat also uses alternative names for Standard 14 fonts, which we map to those above
        // these include names such as "Arial" and "TimesNewRoman"
        for (String baseName : Standard14Fonts.getNames())
        {
            if (!substitutes.containsKey(baseName))
            {
                String mappedName = Standard14Fonts.getMappedFontName(baseName);
                substitutes.put(baseName, copySubstitutes(mappedName));
            }
        }

        // -------------------------

        try
        {
            // local change: stock PDFBox loads
            // "/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf" here
            String ttfName = "ttf/STSONG-light.ttf";
            LOG.info("加载自定义字体文件---> "+ttfName);
            InputStream resourceAsStream =
                    FontMapperImpl.class.getClassLoader().getResourceAsStream(ttfName);
            if (resourceAsStream == null)
            {
                // getResourceAsStream returns null when the resource is absent; fail with a
                // message that names the missing file instead of failing later in the parser
                throw new IOException("Error loading resource: " + ttfName);
            }
            InputStream ttfStream = new BufferedInputStream(resourceAsStream);
            TTFParser ttfParser = new TTFParser();
            lastResortFont = ttfParser.parse(ttfStream);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }

    // lazy thread safe singleton
    private static class DefaultFontProvider
    {
        private static final FontProvider INSTANCE = new FileSystemFontProvider(fontCache);
    }

    /**
     * Sets the font service provider and rebuilds the name index from the fonts it reports.
     *
     * @param fontProvider the provider to use for all subsequent lookups
     */
    public synchronized void setProvider(FontProvider fontProvider)
    {
        fontInfoByName = createFontInfoByName(fontProvider.getFontInfo());
        this.fontProvider = fontProvider;
    }

    /**
     * Returns the font service provider. Defaults to using FileSystemFontProvider.
     */
    public synchronized FontProvider getProvider()
    {
        if (fontProvider == null)
        {
            setProvider(DefaultFontProvider.INSTANCE);
        }
        return fontProvider;
    }

    /**
     * Returns the font cache associated with this FontMapper. This method is needed by
     * FontProvider subclasses.
     */
    public FontCache getFontCache()
    {
        return fontCache;
    }

    /**
     * Builds the lookup index: every font is registered under each name returned by
     * {@link #getPostScriptNames(String)}. When two fonts share a name, the later one in
     * the list replaces the earlier one.
     */
    private Map<String, FontInfo> createFontInfoByName(List<? extends FontInfo> fontInfoList)
    {
        Map<String, FontInfo> map = new LinkedHashMap<String, FontInfo>();
        for (FontInfo info : fontInfoList)
        {
            for (String name : getPostScriptNames(info.getPostScriptName()))
            {
                map.put(name, info);
            }
        }
        return map;
    }

    /**
     * Gets alternative names, as seen in some PDFs, e.g. PDFBOX-142.
     */
    private Set<String> getPostScriptNames(String postScriptName)
    {
        Set<String> names = new HashSet<String>();

        // built-in PostScript name
        names.add(postScriptName);

        // remove hyphens (e.g. Arial-Black -> ArialBlack)
        names.add(postScriptName.replace("-", ""));

        return names;
    }

    /**
     * Returns a mutable copy of the substitute list registered for the given font.
     *
     * @param postScriptName PostScript name that already has an entry in the substitutes map
     */
    private List<String> copySubstitutes(String postScriptName)
    {
        return new ArrayList<String>(substitutes.get(postScriptName));
    }

    /**
     * Adds a substitute for the given font. The replacement is appended to the end of the
     * font's substitute list, so substitutes registered earlier are tried first.
     *
     * @param match PostScript name of the font to match
     * @param replace PostScript name of the font to use as a replacement
     */
    public void addSubstitute(String match, String replace)
    {
        List<String> current = substitutes.get(match);
        // the built-in lists are created with Arrays.asList and are fixed-size, so adding to
        // them directly would throw UnsupportedOperationException; append to a copy instead
        List<String> updated = current == null
                ? new ArrayList<String>()
                : new ArrayList<String>(current);
        updated.add(replace);
        substitutes.put(match, updated);
    }

    /**
     * Returns the substitutes for a given font, or an empty list if none are registered.
     * Spaces in the name are ignored for the lookup.
     */
    private List<String> getSubstitutes(String postScriptName)
    {
        List<String> subs = substitutes.get(postScriptName.replace(" ", ""));
        if (subs != null)
        {
            return subs;
        }
        else
        {
            return Collections.emptyList();
        }
    }

    /**
     * Attempts to find a good fallback based on the font descriptor.
     *
     * @param fontDescriptor the font descriptor, may be null
     * @return the name of one of the Courier, Times or Helvetica standard fonts
     */
    private String getFallbackFontName(PDFontDescriptor fontDescriptor)
    {
        String fontName;
        if (fontDescriptor != null)
        {
            // heuristic detection of bold
            boolean isBold = false;
            String name = fontDescriptor.getFontName();
            if (name != null)
            {
                String lower = name.toLowerCase();
                isBold = lower.contains("bold") ||
                         lower.contains("black") ||
                         lower.contains("heavy");
            }

            // font descriptor flags should describe the style
            if (fontDescriptor.isFixedPitch())
            {
                fontName = "Courier";
                if (isBold && fontDescriptor.isItalic())
                {
                    fontName += "-BoldOblique";
                }
                else if (isBold)
                {
                    fontName += "-Bold";
                }
                else if (fontDescriptor.isItalic())
                {
                    fontName += "-Oblique";
                }
            }
            else if (fontDescriptor.isSerif())
            {
                fontName = "Times";
                if (isBold && fontDescriptor.isItalic())
                {
                    fontName += "-BoldItalic";
                }
                else if (isBold)
                {
                    fontName += "-Bold";
                }
                else if (fontDescriptor.isItalic())
                {
                    fontName += "-Italic";
                }
                else
                {
                    fontName += "-Roman";
                }
            }
            else
            {
                fontName = "Helvetica";
                if (isBold && fontDescriptor.isItalic())
                {
                    fontName += "-BoldOblique";
                }
                else if (isBold)
                {
                    fontName += "-Bold";
                }
                else if (fontDescriptor.isItalic())
                {
                    fontName += "-Oblique";
                }
            }
        }
        else
        {
            // if there is no FontDescriptor then we just fall back to Times Roman
            fontName = "Times-Roman";
        }
        return fontName;
    }

    /**
     * Finds a TrueType font with the given PostScript name, or a suitable substitute. If
     * nothing matches, the last-resort font is returned, so the mapping's font is never null.
     *
     * @param baseFont PostScript name of the requested font
     * @param fontDescriptor FontDescriptor
     */
    @Override
    public FontMapping<TrueTypeFont> getTrueTypeFont(String baseFont,
                                                     PDFontDescriptor fontDescriptor)
    {
        TrueTypeFont ttf = (TrueTypeFont)findFont(FontFormat.TTF, baseFont);
        if (ttf != null)
        {
            return new FontMapping<TrueTypeFont>(ttf, false);
        }
        else
        {
            // fallback - todo: i.e. fuzzy match
            String fontName = getFallbackFontName(fontDescriptor);
            ttf = (TrueTypeFont) findFont(FontFormat.TTF, fontName);
            if (ttf == null)
            {
                // we have to return something here as TTFs aren't strictly required on the system
                ttf = lastResortFont;
            }
            return new FontMapping<TrueTypeFont>(ttf, true);
        }
    }

    /**
     * Finds a font with the given PostScript name, or a suitable substitute. This allows
     * any font to be substituted with a PFB, TTF or OTF. If nothing matches, the last-resort
     * font is returned, so the mapping's font is never null.
     *
     * @param baseFont PostScript name of the requested font
     * @param fontDescriptor the FontDescriptor of the font to find
     */
    @Override
    public FontMapping<FontBoxFont> getFontBoxFont(String baseFont,
                                                   PDFontDescriptor fontDescriptor)
    {
        FontBoxFont font = findFontBoxFont(baseFont);
        if (font != null)
        {
            return new FontMapping<FontBoxFont>(font, false);
        }
        else
        {
            // fallback - todo: i.e. fuzzy match
            String fallbackName = getFallbackFontName(fontDescriptor);
            font = findFontBoxFont(fallbackName);
            if (font == null)
            {
                // we have to return something here as TTFs aren't strictly required on the system
                font = lastResortFont;
            }
            return new FontMapping<FontBoxFont>(font, true);
        }
    }

    /**
     * Finds a font with the given PostScript name, or a suitable substitute, or null.
     * Formats are tried in the order PFB, TTF, OTF.
     *
     * @param postScriptName PostScript font name
     */
    private FontBoxFont findFontBoxFont(String postScriptName)
    {
        Type1Font t1 = (Type1Font)findFont(FontFormat.PFB, postScriptName);
        if (t1 != null)
        {
            return t1;
        }

        TrueTypeFont ttf = (TrueTypeFont)findFont(FontFormat.TTF, postScriptName);
        if (ttf != null)
        {
            return ttf;
        }

        OpenTypeFont otf = (OpenTypeFont) findFont(FontFormat.OTF, postScriptName);
        if (otf != null)
        {
            return otf;
        }

        return null;
    }

    /**
     * Finds a font with the given PostScript name, or a suitable substitute, or null.
     *
     * @param format the required font format
     * @param postScriptName PostScript font name, may be null
     */
    private FontBoxFont findFont(FontFormat format, String postScriptName)
    {
        // handle damaged PDFs, see PDFBOX-2884
        if (postScriptName == null)
        {
            return null;
        }

        // make sure the font provider is initialized
        if (fontProvider == null)
        {
            getProvider();
        }

        // first try to match the PostScript name
        FontInfo info = getFont(format, postScriptName);
        if (info != null)
        {
            return info.getFont();
        }

        // remove hyphens (e.g. Arial-Black -> ArialBlack)
        info = getFont(format, postScriptName.replace("-", ""));
        if (info != null)
        {
            return info.getFont();
        }

        // then try named substitutes
        for (String substituteName : getSubstitutes(postScriptName))
        {
            info = getFont(format, substituteName);
            if (info != null)
            {
                return info.getFont();
            }
        }

        // then try converting Windows names e.g. (ArialNarrow,Bold) -> (ArialNarrow-Bold)
        info = getFont(format, postScriptName.replace(",", "-"));
        if (info != null)
        {
            return info.getFont();
        }

        // try appending "-Regular", works for Wingdings on windows
        info = getFont(format, postScriptName + "-Regular");
        if (info != null)
        {
            return info.getFont();
        }
        // no matches
        return null;
    }

    /**
     * Finds the named font with the given format, or null if the name is unknown or the
     * font registered under it has a different format.
     */
    private FontInfo getFont(FontFormat format, String postScriptName)
    {
        // strip subset tag (happens when we substitute a corrupt embedded font, see PDFBOX-2642)
        if (postScriptName.contains("+"))
        {
            postScriptName = postScriptName.substring(postScriptName.indexOf('+') + 1);
        }

        // look up the PostScript name
        FontInfo info = fontInfoByName.get(postScriptName);
        if (info != null && info.getFormat() == format)
        {
            if (LOG.isDebugEnabled())
            {
                LOG.debug(String.format("getFont('%s','%s') returns %s", format, postScriptName, info));
            }
            return info;
        }
        return null;
    }

    /**
     * Finds a CFF CID-Keyed font with the given PostScript name, or a suitable substitute.
     * This method can also map CJK fonts via their CIDSystemInfo (ROS). If nothing matches,
     * a mapping to the last-resort font is returned.
     *
     * @param baseFont PostScript name of the requested font
     * @param fontDescriptor FontDescriptor
     * @param cidSystemInfo the CID system info, e.g. "Adobe-Japan1", if any.
     */
    @Override
    public CIDFontMapping getCIDFont(String baseFont, PDFontDescriptor fontDescriptor,
                                     PDCIDSystemInfo cidSystemInfo)
    {
        // try name match or substitute with OTF
        OpenTypeFont otf1 = (OpenTypeFont)findFont(FontFormat.OTF, baseFont);
        if (otf1 != null)
        {
            return new CIDFontMapping(otf1, null, false);
        }

        // try name match or substitute with TTF
        TrueTypeFont ttf = (TrueTypeFont)findFont(FontFormat.TTF, baseFont);
        if (ttf != null)
        {
            return new CIDFontMapping(null, ttf, false);
        }

        if (cidSystemInfo != null)
        {
            // "In Acrobat 3.0.1 and later, Type 0 fonts that use a CMap whose CIDSystemInfo
            // dictionary defines the Adobe-GB1, Adobe-CNS1 Adobe-Japan1, or Adobe-Korea1 character
            // collection can also be substituted." - Adobe Supplement to the ISO 32000

            String collection = cidSystemInfo.getRegistry() + "-" + cidSystemInfo.getOrdering();

            if (collection.equals("Adobe-GB1") || collection.equals("Adobe-CNS1") ||
                collection.equals("Adobe-Japan1") || collection.equals("Adobe-Korea1"))
            {
                // try automatic substitutes via character collection
                PriorityQueue<FontMatch> queue = getFontMatches(fontDescriptor, cidSystemInfo);
                FontMatch bestMatch = queue.poll();
                if (bestMatch != null)
                {
                    if (LOG.isDebugEnabled())
                    {
                        LOG.debug("Best match for '" + baseFont + "': " + bestMatch.info);
                    }
                    FontBoxFont font = bestMatch.info.getFont();
                    if (font instanceof OpenTypeFont)
                    {
                        return new CIDFontMapping((OpenTypeFont)font, null, true);
                    }
                    else if (font != null)
                    {
                        return new CIDFontMapping(null, font, true);
                    }
                }
            }
        }

        // last-resort fallback
        return new CIDFontMapping(null, lastResortFont, true);
    }

    /**
     * Returns a list of matching fonts, scored by suitability. Positive scores indicate matches
     * for certain attributes, while negative scores indicate mismatches. Zero scores are neutral.
     *
     * @param fontDescriptor FontDescriptor, always present.
     * @param cidSystemInfo Font's CIDSystemInfo, may be null.
     */
    private PriorityQueue<FontMatch> getFontMatches(PDFontDescriptor fontDescriptor,
                                                    PDCIDSystemInfo cidSystemInfo)
    {
        // findFont() returns before initializing the provider when the font name is null,
        // so the name index may not exist yet on the first lookup
        if (fontProvider == null)
        {
            getProvider();
        }

        PriorityQueue<FontMatch> queue = new PriorityQueue<FontMatch>(20);

        for (FontInfo info : fontInfoByName.values())
        {
            // filter by CIDSystemInfo, if given
            if (cidSystemInfo != null && !isCharSetMatch(cidSystemInfo, info))
            {
                continue;
            }

            FontMatch match = new FontMatch(info);

            // Panose is the most reliable
            if (fontDescriptor.getPanose() != null && info.getPanose() != null)
            {
                PDPanoseClassification panose = fontDescriptor.getPanose().getPanose();
                if (panose.getFamilyKind() == info.getPanose().getFamilyKind())
                {
                    if (panose.getFamilyKind() == 0 &&
                        (info.getPostScriptName().toLowerCase().contains("barcode") ||
                         info.getPostScriptName().startsWith("Code")) &&
                        !probablyBarcodeFont(fontDescriptor))
                    {
                        // PDFBOX-4268: ignore barcode font if we aren't searching for one.
                        continue;
                    }
                    // serifs
                    if (panose.getSerifStyle() == info.getPanose().getSerifStyle())
                    {
                        // exact match
                        match.score += 2;
                    }
                    else if (panose.getSerifStyle() >= 2 && panose.getSerifStyle() <= 5 &&
                             info.getPanose().getSerifStyle() >= 2 &&
                             info.getPanose().getSerifStyle() <= 5)
                    {
                        // cove (serif)
                        match.score += 1;
                    }
                    else if (panose.getSerifStyle() >= 11 && panose.getSerifStyle() <= 13 &&
                             info.getPanose().getSerifStyle() >= 11 &&
                             info.getPanose().getSerifStyle() <= 13)
                    {
                        // sans-serif
                        match.score += 1;
                    }
                    else if (panose.getSerifStyle() != 0 && info.getPanose().getSerifStyle() != 0)
                    {
                        // mismatch
                        match.score -= 1;
                    }

                    // weight
                    int weight = info.getPanose().getWeight();
                    int weightClass = info.getWeightClassAsPanose();
                    if (Math.abs(weight - weightClass) > 2)
                    {
                        // inconsistent data in system font, usWeightClass wins
                        weight = weightClass;
                    }

                    if (panose.getWeight() == weight)
                    {
                        // exact match
                        match.score += 2;
                    }
                    else if (panose.getWeight() > 1 && weight > 1)
                    {
                        float dist = Math.abs(panose.getWeight() - weight);
                        match.score += 1 - dist * 0.5;
                    }

                    // todo: italic
                    // ...
                }
            }
            else if (fontDescriptor.getFontWeight() > 0 && info.getWeightClass() > 0)
            {
                // usWeightClass is pretty reliable
                float dist = Math.abs(fontDescriptor.getFontWeight() - info.getWeightClass());
                match.score += 1 - (dist / 100) * 0.5;
            }
            // todo: italic
            // ...

            queue.add(match);
        }
        return queue;
    }

    /**
     * Returns true if the descriptor's font family or font name starts with "Code" or
     * contains "barcode" (case-insensitively); missing values are treated as empty strings.
     */
    private boolean probablyBarcodeFont(PDFontDescriptor fontDescriptor)
    {
        String ff = fontDescriptor.getFontFamily();
        if (ff == null)
        {
            ff = "";
        }
        String fn = fontDescriptor.getFontName();
        if (fn == null)
        {
            fn = "";
        }
        return ff.startsWith("Code") || ff.toLowerCase().contains("barcode") ||
               fn.startsWith("Code") || fn.toLowerCase().contains("barcode");
    }

    /**
     * Returns true if the character set described by CIDSystemInfo is present in the given font.
     * Only applies to Adobe-GB1, Adobe-CNS1, Adobe-Japan1, Adobe-Korea1, as per the PDF spec.
     */
    private boolean isCharSetMatch(PDCIDSystemInfo cidSystemInfo, FontInfo info)
    {
        if (info.getCIDSystemInfo() != null)
        {
            return info.getCIDSystemInfo().getRegistry().equals(cidSystemInfo.getRegistry()) &&
                   info.getCIDSystemInfo().getOrdering().equals(cidSystemInfo.getOrdering());
        }
        else
        {
            // the font carries no CIDSystemInfo, so decide from the code page range bit field
            long codePageRange = info.getCodePageRange();

            long JIS_JAPAN = 1 << 17;
            long CHINESE_SIMPLIFIED = 1 << 18;
            long KOREAN_WANSUNG = 1 << 19;
            long CHINESE_TRADITIONAL = 1 << 20;
            long KOREAN_JOHAB = 1 << 21;

            if ("MalgunGothic-Semilight".equals(info.getPostScriptName()))
            {
                // PDFBOX-4793 and PDF.js 10699: This font has only Korean, but has bits 17-21 set.
                codePageRange &= ~(JIS_JAPAN | CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL);
            }
            if (cidSystemInfo.getOrdering().equals("GB1") &&
                    (codePageRange & CHINESE_SIMPLIFIED) == CHINESE_SIMPLIFIED)
            {
                return true;
            }
            else if (cidSystemInfo.getOrdering().equals("CNS1") &&
                    (codePageRange & CHINESE_TRADITIONAL) == CHINESE_TRADITIONAL)
            {
                return true;
            }
            else if (cidSystemInfo.getOrdering().equals("Japan1") &&
                    (codePageRange & JIS_JAPAN) == JIS_JAPAN)
            {
                return true;
            }
            else
            {
                return cidSystemInfo.getOrdering().equals("Korea1") &&
                        ((codePageRange & KOREAN_WANSUNG) == KOREAN_WANSUNG ||
                         (codePageRange & KOREAN_JOHAB) == KOREAN_JOHAB);
            }
        }
    }

    /**
     * A potential match for a font substitution.
     */
    private static class FontMatch implements Comparable<FontMatch>
    {
        double score;
        final FontInfo info;

        FontMatch(FontInfo info)
        {
            this.info = info;
        }

        /**
         * Orders matches by descending score, so that the head of a PriorityQueue is the
         * highest-scoring match.
         */
        @Override
        public int compareTo(FontMatch match)
        {
            return Double.compare(match.score, this.score);
        }
    }

    /**
     * For debugging. Prints all matches and returns the best match. Note that this drains
     * the queue.
     */
    private FontMatch printMatches(PriorityQueue<FontMatch> queue)
    {
        FontMatch bestMatch = queue.peek();
        System.out.println("-------");
        while (!queue.isEmpty())
        {
            FontMatch match = queue.poll();
            FontInfo info = match.info;
            System.out.println(match.score + " | " + info.getMacStyle() + " " +
                               info.getFamilyClass() + " " + info.getPanose() + " " +
                               info.getCIDSystemInfo() + " " + info.getPostScriptName() + " " +
                               info.getFormat());
        }
        System.out.println("-------");
        return bestMatch;
    }
}