お忙しいところお手数ですが質問させてください。
クロールによって作成されたインデックスと実ファイルに差分がありました。
fess-crawler.logおよび障害URLを確認したところ以下の4点のエラーメッセージを確認しております。
特に1)、2)の原因特定ができていないため、
ご知見がございましたらご教示いただきたく存じます。
1)java.lang.ArrayIndexOutOfBoundsException
2023-06-21 11:20:12,956 [Crawler-20230621111833-1-5] WARN Unexpected error processing command, ignoring and continuing. Command: org.apache.poi.hdgf.chunks.Chunk$Command@476c7347
java.lang.ArrayIndexOutOfBoundsException: Index 23 out of bounds for length 20
at org.apache.poi.util.LittleEndian.getLong(LittleEndian.java:155) ~[poi-5.2.3.jar:5.2.3]
at org.apache.poi.util.LittleEndian.getDouble(LittleEndian.java:74) ~[poi-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.chunks.Chunk.processCommands(Chunk.java:199) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.chunks.ChunkFactory.createChunk(ChunkFactory.java:207) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.streams.ChunkStream.findChunks(ChunkStream.java:66) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.streams.PointerContainingStream.findChildren(PointerContainingStream.java:70) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.streams.PointerContainingStream.findChildren(PointerContainingStream.java:77) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.HDGFDiagram.<init>(HDGFDiagram.java:89) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.extractor.VisioTextExtractor.<init>(VisioTextExtractor.java:52) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:232) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:175) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-core-2.6.0.jar:2.6.0]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-core-2.6.0.jar:2.6.0]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor$TikaDetectParser.parse(TikaExtractor.java:507) ~[fess-crawler-14.6.1.jar:?]
at org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:71) ~[tika-core-2.6.0.jar:2.6.0]
at org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:109) ~[tika-core-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.handleEmbeddedFile(AbstractOOXMLExtractor.java:406) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.handleEmbeddedPart(AbstractOOXMLExtractor.java:275) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.handleEmbeddedParts(AbstractOOXMLExtractor.java:217) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.getXHTML(AbstractOOXMLExtractor.java:138) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory.parse(OOXMLExtractorFactory.java:243) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.OOXMLParser.parse(OOXMLParser.java:115) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:269) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:175) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-core-2.6.0.jar:2.6.0]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-core-2.6.0.jar:2.6.0]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor$TikaDetectParser.parse(TikaExtractor.java:507) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor.lambda$getText$0(TikaExtractor.java:194) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor.getContent(TikaExtractor.java:404) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor.getText(TikaExtractor.java:183) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.transformer.AbstractFessFileTransformer.getExtractData(AbstractFessFileTransformer.java:387) ~[classes/:?]
at org.codelibs.fess.crawler.transformer.AbstractFessFileTransformer.generateData(AbstractFessFileTransformer.java:100) ~[classes/:?]
at org.codelibs.fess.crawler.transformer.AbstractFessFileTransformer.transform(AbstractFessFileTransformer.java:81) ~[classes/:?]
at org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor.process(DefaultResponseProcessor.java:74) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.CrawlerThread.processResponse(CrawlerThread.java:291) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.FessCrawlerThread.processResponse(FessCrawlerThread.java:249) ~[classes/:?]
at org.codelibs.fess.crawler.CrawlerThread.run(CrawlerThread.java:162) ~[fess-crawler-14.6.1.jar:?]
at java.lang.Thread.run(Thread.java:833) ~[?:?]
2)java.lang.IllegalArgumentException
2023-06-21 11:20:23,978 [Crawler-20230621111833-1-3] WARN Failed to create chunk at 20885, ignoring rest of data.
java.lang.IllegalArgumentException: Found a chunk with a negative length, which isn't allowed
at org.apache.poi.hdgf.chunks.ChunkFactory.createChunk(ChunkFactory.java:149) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.streams.ChunkStream.findChunks(ChunkStream.java:66) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.streams.PointerContainingStream.findChildren(PointerContainingStream.java:70) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.streams.PointerContainingStream.findChildren(PointerContainingStream.java:77) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.HDGFDiagram.<init>(HDGFDiagram.java:89) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.poi.hdgf.extractor.VisioTextExtractor.<init>(VisioTextExtractor.java:52) ~[poi-scratchpad-5.2.3.jar:5.2.3]
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:232) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:175) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-core-2.6.0.jar:2.6.0]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-core-2.6.0.jar:2.6.0]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor$TikaDetectParser.parse(TikaExtractor.java:507) ~[fess-crawler-14.6.1.jar:?]
at org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:71) ~[tika-core-2.6.0.jar:2.6.0]
at org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:109) ~[tika-core-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.handleEmbeddedFile(AbstractOOXMLExtractor.java:406) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.handleEmbeddedPart(AbstractOOXMLExtractor.java:275) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.handleEmbeddedParts(AbstractOOXMLExtractor.java:217) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.getXHTML(AbstractOOXMLExtractor.java:138) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory.parse(OOXMLExtractorFactory.java:243) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.ooxml.OOXMLParser.parse(OOXMLParser.java:115) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:269) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:175) ~[tika-parser-microsoft-module-2.6.0.jar:2.6.0]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-core-2.6.0.jar:2.6.0]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-core-2.6.0.jar:2.6.0]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor$TikaDetectParser.parse(TikaExtractor.java:507) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor.lambda$getText$0(TikaExtractor.java:194) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor.getContent(TikaExtractor.java:404) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.extractor.impl.TikaExtractor.getText(TikaExtractor.java:183) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.transformer.AbstractFessFileTransformer.getExtractData(AbstractFessFileTransformer.java:387) ~[classes/:?]
at org.codelibs.fess.crawler.transformer.AbstractFessFileTransformer.generateData(AbstractFessFileTransformer.java:100) ~[classes/:?]
at org.codelibs.fess.crawler.transformer.AbstractFessFileTransformer.transform(AbstractFessFileTransformer.java:81) ~[classes/:?]
at org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor.process(DefaultResponseProcessor.java:74) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.CrawlerThread.processResponse(CrawlerThread.java:291) ~[fess-crawler-14.6.1.jar:?]
at org.codelibs.fess.crawler.FessCrawlerThread.processResponse(FessCrawlerThread.java:249) ~[classes/:?]
at org.codelibs.fess.crawler.CrawlerThread.run(CrawlerThread.java:162) ~[fess-crawler-14.6.1.jar:?]
3)java.net.URISyntaxException: Illegal character in path at
→ファイル名にスペースが含まれると発生する。ファイル名を修正して対応する予定です。
他にFess側で対応することは可能でしょうか。
4)MaxLengthExceededException
→ファイルサイズの上限を超えた際に発生するエラー。デフォルトは10MB。
設定上の上限値はありますでしょうか。