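# Sphinx configuration (sphinx.conf) used together with the script below:
#
# # Data source
# source SourceParseXml
# {
#     # The source type is xmlpipe2
#     type = xmlpipe2
#
#     # Command the indexer runs to obtain the documents; its standard output
#     # must be the XML stream. Here xml-parser.php generates that XML.
#     xmlpipe_command = /path/to/php /path/to/xml-parser.php
# }
#
# # Index
# index IndexParseXml
# {
#     # Source the index is built from
#     source = SourceParseXml
#
#     # Where the index data is stored
#     path = /path/where/to/store/index-data
#
#     # Minimum length of an indexed word
#     min_word_len = 1
#
#     # Character set of the documents
#     charset_type = utf-8
# }
#
# # Indexer settings
# indexer
# {
#     # Memory limit for the indexer
#     mem_limit = 32M
# }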
/*
 * xmlpipe2 generator: walks files/<id>/<document>, extracts plain text from
 * PDF, DOCX and DOC files and prints a sphinx:docset XML stream to stdout.
 *
 * Standard output carries ONLY the XML (it is what the indexer parses);
 * diagnostics are written to stderr.
 */

// MIME types (as reported by mime_content_type) that are accepted.
$allowedMimes = array(
    'application/pdf',
    'application/zip',           // docx
    'application/vnd.ms-office', // doc (OpenOffice, per the original note)
    'application/msword',
);

// File extensions that are accepted (compared in lower case).
$allowedExtentions = array('pdf', 'docx', 'doc');

// External tools used for text extraction.
$pdfInfoPath   = '/usr/bin/pdfinfo';   // pdfinfo
$pdfToTextPath = '/usr/bin/pdftotext'; // pdftotext
$catDocPath    = '/usr/bin/catdoc';    // catdoc

// Directory of this script, with a trailing separator.
define('FILE_PATH', realpath(dirname(__FILE__)) . DIRECTORY_SEPARATOR);

// Make the Zend classes packed in Zend.phar.gz resolvable through include_path.
// The entry must read "phar://..." to be recognised as a stream wrapper; the
// previous literal "phar\://" (backslash kept by the single-quoted string) was not.
if (strpos(get_include_path(), 'Zend.phar.gz') === false) {
    ini_set(
        'include_path',
        ini_get('include_path') . PATH_SEPARATOR . 'phar://' . FILE_PATH . 'Zend.phar.gz'
    );
}

// Load the docx reader from the archive, or abort.
$phar = new Phar(FILE_PATH . 'Zend.phar.gz', 0, 'Zend.phar.gz');
$docxClassFile = 'Zend/Search/Lucene/Document/Docx.php';

if (!isset($phar[$docxClassFile])) {
    // stderr, not stdout: stdout is reserved for the XML stream.
    file_put_contents(
        'php://stderr',
        'ERROR: can\'t load "Zend/Search/Lucene/Document/Docx.php" !' . PHP_EOL
    );
    exit(1);
}

require_once((string) $phar[$docxClassFile]);

// Start the XML document: <sphinx:docset> with a schema of one field, "content".
$xmlWriter = new XMLWriter();
$xmlWriter->openMemory();
$xmlWriter->setIndent(true);
$xmlWriter->startDocument('1.0', 'UTF-8');
$xmlWriter->startElement('sphinx:docset');

$xmlWriter->startElement('sphinx:schema');
$xmlWriter->startElement('sphinx:field');
$xmlWriter->writeAttribute('name', 'content');
$xmlWriter->endElement(); // sphinx:field
$xmlWriter->endElement(); // sphinx:schema

/*
 * Expected layout: the "files" directory contains sub-folders, and the name
 * of each sub-folder is used as the document id. Example: files/01/file.pdf
 */
foreach (new DirectoryIterator(dirname(__FILE__) . '/files') as $folder) {
    // Only real sub-folders are of interest.
    if (!$folder->isDir() || $folder->isDot()) {
        continue;
    }

    foreach (new DirectoryIterator($folder->getPathname()) as $file) {
        if ($file->isDir()) {
            continue;
        }

        // Filter by extension first, then by detected MIME type.
        $extension = strtolower(pathinfo($file->getFilename(), PATHINFO_EXTENSION));
        if (!in_array($extension, $allowedExtentions, true)) {
            continue;
        }

        $filePath = $file->getPathname();
        $mime     = mime_content_type($filePath); // detected once, reused below
        if (!in_array($mime, $allowedMimes, true)) {
            continue;
        }

        $text           = '';
        $filePathEscape = escapeshellarg($filePath);

        try {
            switch ($mime) {
                case 'application/pdf':
                    // Ask pdfinfo about the file first. Each argument is quoted
                    // with escapeshellarg() only; wrapping the whole command in
                    // escapeshellcmd() as well would double-escape the path.
                    $pdfInfoOutput = shell_exec(escapeshellarg($pdfInfoPath) . ' ' . $filePathEscape);

                    $pdfInfo = array();
                    foreach (explode("\n", (string) $pdfInfoOutput) as $str) {
                        // Split on the first colon only, so values that contain
                        // colons themselves (e.g. timestamps) are kept.
                        $parts = explode(':', $str, 2);
                        if (count($parts) !== 2) {
                            continue;
                        }

                        $key = trim($parts[0]);
                        $val = trim($parts[1]);
                        if ($key !== '' && $val !== '') {
                            $pdfInfo[$key] = $val;
                        }
                    }

                    // No usable info, or pdfinfo reported an error: skip the file.
                    // "continue 2" because the switch itself counts as one level.
                    if (empty($pdfInfo) || !empty($pdfInfo['Error'])) {
                        continue 2;
                    }

                    // Extract the text with pdftotext, writing to stdout ("-").
                    $text = shell_exec(
                        escapeshellarg($pdfToTextPath) . ' -nopgbrk ' . $filePathEscape . ' -'
                    );
                    break;

                case 'application/zip':
                    // docx: read the body through Zend_Search_Lucene_Document_Docx.
                    $docx = Zend_Search_Lucene_Document_Docx::loadDocxFile($filePath);
                    $text = $docx->getFieldValue('body');
                    break;

                case 'application/vnd.ms-office':
                case 'application/msword':
                    // doc: extract the text with catdoc.
                    $text = shell_exec(escapeshellarg($catDocPath) . ' ' . $filePathEscape);
                    break;
            }

            // shell_exec() may return null/false; nothing extracted means nothing to index.
            $text = (string) $text;
            if ($text === '') {
                continue;
            }

            $text = strip_tags($text);

            // Convert to UTF-8.
            $givenEncode = mb_detect_encoding($text);
            $text = $givenEncode
                ? iconv($givenEncode, 'UTF-8', $text)
                : mb_convert_encoding($text, 'UTF-8');

            // iconv() returns false on failure; do not emit a broken document.
            if ($text === false || $text === '') {
                continue;
            }
        } catch (Exception $e) {
            // Report on stderr so the XML on stdout stays well-formed.
            file_put_contents('php://stderr', $e->getMessage() . PHP_EOL);
            continue;
        }

        $xmlWriter->startElement('sphinx:document');
        // $folder->getBasename() is the name of the containing folder; it is the document id.
        $xmlWriter->writeAttribute('id', $folder->getBasename());
        $xmlWriter->startElement('content');
        // A literal "]]>" would terminate the CDATA section early; split it
        // across two adjacent sections instead.
        $xmlWriter->writeCData(str_replace(']]>', ']]]]><![CDATA[>', $text));
        $xmlWriter->endElement(); // content
        $xmlWriter->endElement(); // sphinx:document
    }
}

$xmlWriter->endElement(); // sphinx:docset

$xml = $xmlWriter->outputMemory();

// Let tidy repair the XML; fall back to the raw XML if tidy fails.
$tidy = tidy_repair_string($xml, array(
    'output-xml' => true,
    'input-xml'  => true,
), 'utf8');

echo $tidy !== false ? $tidy : $xml;
Source: https://habr.com/ru/post/131089/
All Articles