Those who love sausage and respect the law had better not watch how either is made.
public class SimpleTextExtractor { public static void main(String[] args) throws IOException { // , - PdfReader reader = new PdfReader(args[0]); // , PDF . for (int i = 1; i <= reader.getNumberOfPages(); ++i) { TextExtractionStrategy strategy = new SimpleTextExtractionStrategy(); String text = PdfTextExtractor.getTextFromPage(reader, i, strategy); System.out.println(text); } // reader.close(); } }
public void renderText(TextRenderInfo renderInfo)
- while getTextFromPage runs, this method is called for every command that displays text. TextRenderInfo carries all the necessary information: text, font, and coordinates.
public String getResultantText()
- this function is called before the end of getTextFromPage and its result will be returned to the user. public class TextExtractionStrategyImpl implements TextExtractionStrategy { private TreeMap<Float, TreeMap<Float, String>> textMap; public TextExtractionStrategyImpl() { // reverseOrder y textMap = new TreeMap<Float, TreeMap<Float, String>>(Collections.reverseOrder()); } @Override public String getResultantText() { StringBuilder stringBuilder = new StringBuilder(); // for (Map.Entry<Float, TreeMap<Float, String>> stringMap: textMap.entrySet()) { // for (Map.Entry<Float, String> entry: stringMap.getValue().entrySet()) { stringBuilder.append(entry.getValue()); } stringBuilder.append('\n'); } return stringBuilder.toString(); } @Override public void beginTextBlock() {} @Override public void renderText(TextRenderInfo renderInfo) { // Float x = renderInfo.getBaseline().getStartPoint().get(Vector.I1); Float y = renderInfo.getBaseline().getStartPoint().get(Vector.I2); // . if (!textMap.containsKey(y)) { textMap.put(y, new TreeMap<Float, String>()); } textMap.get(y).put(x, renderInfo.getText()); } @Override public void endTextBlock() {} @Override public void renderImage(ImageRenderInfo imageRenderInfo) {} // y- ArrayList<Pair<Float, String>> getStringsWithCoordinates() { ArrayList<Pair<Float, String>> result = new ArrayList<Pair<Float, String>>(); for (Map.Entry<Float, TreeMap<Float, String>> stringMap: textMap.entrySet()) { StringBuilder stringBuilder = new StringBuilder(); for (Map.Entry<Float, String> entry: stringMap.getValue().entrySet()) { stringBuilder.append(entry.getValue()); } result.add(new Pair<Float, String>(stringMap.getKey(), stringBuilder.toString())); } return result; } }
public class TextExtractor { public static void main(String[] args) throws IOException { PdfReader reader = new PdfReader(args[0]); for (int i = 1; i <= reader.getNumberOfPages(); ++i) { TextExtractionStrategyImpl strategy = new TextExtractionStrategyImpl(); // , PdfTextExtractor.getTextFromPage(reader, i, strategy); System.out.println("Page : " + i); for (Pair<Float, String> pair: strategy.getStringsWithCoordinates()) { System.out.println(pair.getKey().toString() + " " + pair.getValue()); } } reader.close(); } }
Source: https://habr.com/ru/post/225647/
All Articles