.

KGRKJGETMRETU895U-589TY5MIGM5JGB5SDFESFREWTGR54TY
Server : Apache/2.4.62
System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64
User : www ( 80)
PHP Version : 8.3.8
Disable Function : NONE
Directory : /domains/irtiweb/CATS/lib/
Upload File :
Current File : /domains/irtiweb/CATS/lib/DocumentToText.php
<?php
/**
 * CATS
 * Document to Text Conversion Library
 *
 * Copyright (C) 2005 - 2007 Cognizo Technologies, Inc.
 *
 *
 * The contents of this file are subject to the CATS Public License
 * Version 1.1a (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.catsone.com/.
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is "CATS Standard Edition".
 *
 * The Initial Developer of the Original Code is Cognizo Technologies, Inc.
 * Portions created by the Initial Developer are Copyright (C) 2005 - 2007
 * (or from the year in which this file was created to the year 2007) by
 * Cognizo Technologies, Inc. All Rights Reserved.
 *
 *
 * @package    CATS
 * @subpackage Library
 * @copyright Copyright (C) 2005 - 2007 Cognizo Technologies, Inc.
 * @version    $Id: DocumentToText.php 3587 2007-11-13 03:55:57Z will $
 */

include_once('./lib/SystemUtility.php');
include_once('./lib/FileUtility.php');

/**
 *	Document to Text Conversion Library
 *	@package    CATS
 *	@subpackage Library
 */
class DocumentToText
{
    private $_linesArray = array();
    private $_linesString = '';
    private $_rawOutput = '';
    private $_fileName = '';
    private $_returnCode = -1;
    private $_isError = false;
    private $_error = '';


    /**
     * Returns a document type based on its file extension and content type.
     * This is simply a wrapper for FileUtility::getDocumentType().
     *
     * @param string Document file name with extension.
     * @param string MIME content type.
     * @return flag Document type flag.
     */
    public function getDocumentType($fileName, $contentType = false)
    {
        return FileUtility::getDocumentType($fileName, $contentType);
    }

    /**
     * Attempts to convert a document document to plain text.
     *
     * @param string file name
     * @param flag document type
     * @return boolean True if successful; false otherwise.ful
     */
    public function convert($fileName, $documentType)
    {
        /* (Re?)initialize variables. */
        $this->_linesArray  = array();
        $this->_linesString = '';
        $this->_rawOutput   = '';
        $this->_fileName    = $fileName;

        /* If we are trying to parse a DOC file, is it really a DOC file or is
         * it an RTF file?
         */
        if ($documentType == DOCUMENT_TYPE_DOC)
        {
            $handle = @fopen(realpath($fileName), 'r');
            if ($handle)
            {
                $header = fread($handle, 5);
                fclose($handle);
                
                if ($header == '{\rtf')
                {
                    $documentType = DOCUMENT_TYPE_RTF;
                }
            }
        }
        
        /* Find the absolute path to the filename and escape it for use in a
         * system command.
         */
        $escapedFilename = escapeshellarg(realpath($fileName));

        /* Use different methods to extract text depending on the type of document. */
        switch ($documentType)
        {
            case DOCUMENT_TYPE_DOC:
                if (ANTIWORD_PATH == '')
                {
                    $this->_setError('The DOC format has not been configured.');
                    return false;
                }

                $nativeEncoding = 'ISO-8859-1';
                $command = '"'. ANTIWORD_PATH . '" -m ' . ANTIWORD_MAP . ' '
                    . $escapedFilename;
                break;

            case DOCUMENT_TYPE_PDF:
                if (PDFTOTEXT_PATH == '')
                {
                    $this->_setError('The PDF format has not been configured.');
                    return false;
                }

                $nativeEncoding = 'ISO-8859-1';
                $convertEncoding = false;
                $command = '"'. PDFTOTEXT_PATH . '" -layout ' . $escapedFilename . ' -';
                break;

            case DOCUMENT_TYPE_HTML:
                if (HTML2TEXT_PATH == '')
                {
                    $this->_setError('The HTML format has not been configured.');
                    return false;
                }

                $nativeEncoding = 'ISO-8859-1';
                $convertEncoding = false;
                
                if (SystemUtility::isWindows())
                {
                    $command = 'TYPE ' . $escapedFilename . ' | "'. HTML2TEXT_PATH . '" -nobs ';
                }
                else
                {
                    $command = '"'. HTML2TEXT_PATH . '" -nobs ' . $escapedFilename;
                }                
                break;

            case DOCUMENT_TYPE_TEXT:
                return $this->_readTextFile($fileName);
                break;

            case DOCUMENT_TYPE_RTF;
                $this->_rawOutput = $this->rtf2text($fileName);
                if ($this->_rawOutput == null)
                {
                    return false;
                }
                $this->_linesString = $this->_rawOutput;
                
                return true;                
                break;

            case DOCUMENT_TYPE_ODT:
                $this->_rawOutput = $this->odt2text($filename);
                if ( $this->_rawOutput == null )
                {
                    return false;
                }
                $this->_linesArray = explode("\n", $this->_rawOutput);
                $this->_linesString = $this->_rawOutput;

                return true;
                break;

            case DOCUMENT_TYPE_DOCX:
                $this->_rawOutput = $this->docx2text($fileName);
                if ($this->_rawOutput == null)
                {
                    return false;
                }
                $this->_linesArray = explode("\n", $this->_rawOutput);
                $this->_linesString = $this->_rawOutput;

                return true;
                break;

            case DOCUMENT_TYPE_UNKNOWN:
            default:
                $this->_setError('This file format is unknown format and is not yet supported by CATS.');
                return false;
                break;
        }

        /* Run the text converter. */
        $commandResult = $this->_executeCommand($command);

        /* Store the return code for getReturnCode(). */
        $this->_returnCode = $commandResult['returnCode'];

        /* Store the raw output for getRawOutput(). */
        
        $commandResult['output'] = array_map(
            'rtrim', $commandResult['output']
        );
        $this->_rawOutput = implode("\n", $commandResult['output']);

        /* Fix encoding issues. */
        if ($nativeEncoding == 'ISO-8859-1' && function_exists('iconv'))
        {   
            $this->_rawOutput = iconv(
                $nativeEncoding, 'UTF-8', $this->_rawOutput
            );
        }

        /* If command returned non-zero or output is not an array, assume
         * failure.
         */
        if ($commandResult['returnCode'] != 0 ||
            !is_array($commandResult['output']))
        {
            return false;
        }

        /* Store the output in string and array form. */
        $this->_linesArray  = $commandResult['output'];
        $this->_linesString = $this->_rawOutput;

        return true;
    }

    /**
     * Returns the word document in plain-text format. Make sure to check
     * the return value of the convert() method to make sure conversion was
     * actually preformed.
     *
     * @return string document in plain-text
     */
    public function getString()
    {
        return $this->_linesString;
    }

    /**
     * Returns the word document in plain-text format as an array of line
     * number => line. Make sure to check the return value of the convert()
     * method to make sure conversion was actually preformed.
     *
     * @return array document lines
     */
    public function getArray()
    {
        return $this->_linesArray;
    }

    /**
     * Returns the return code from the execution of the document converter for
     * the last conversion. This is useful for error handling / debugging.
     *
     * @return integer converter return code
     */
    public function getReturnCode()
    {
        return $this->_returnCode;
    }

    /**
     * Returns a string containing the raw output from the document converter
     * for the last conversion. This is useful for error handling / debugging.
     *
     * @return string raw converter output
     */
    public function getRawOutput()
    {
        return $this->_rawOutput;
    }

    /**
     * Returns true if an error occurred trying to convert the document.
     *
     * @return boolean error occurred
     */
    public function isError()
    {
        return $this->_isError;
    }

    /**
     * Returns the current error message, or '' if no errors have occurred.
     *
     * @return string error message
     */
    public function getError()
    {
        return $this->_error;
    }


    /**
     * Triggers an error message.
     *
     * @param string error message
     * @param integer converter return code
     * @return void
     */
    private function _setError($errorMessage, $returnCode = 1)
    {
        $this->_rawOutput = '';
        $this->_linesArray  = array();
        $this->_linesString = '';
        $this->_returnCode = $returnCode;
        $this->_isError = true;
        $this->_error = $errorMessage;
    }

    /**
     * Loads a text file as if it was "converted" to text.
     *
     * @param string path to text file
     * @return boolean True if successful; false otherwise.
     */
    private function _readTextFile($fileName)
    {
        $contents = @file($fileName);
        if ($contents === false || !is_array($contents))
        {
            $this->_setError('Failed to parse text file.');
            return false;
        }

        $contents = array_map('rtrim', $contents);
        
        $this->_rawOutput = implode("\n", $contents);
        $this->_linesArray  = $contents;
        $this->_linesString = $this->_rawOutput;
        $this->_returnCode = 0;
        return true;
    }

    /**
     * Executes a shell command in a platform-independent way and returns the
     * results in an array containing the exact system command executed, the
     * raw output of that command, and the command's return code.
     *
     * @param string command to execute
     * @return array command results
     */
    private function _executeCommand($command)
    {
        /* Running on Windows? */
        if (SystemUtility::isWindows())
        {
            /* Generate a random temp file name. */
            $tempFile = sprintf(
                '%s/%s.txt',
                realpath(CATS_TEMP_DIR),
                FileUtility::makeRandomFilename()
            );

            /* Create a new COM Windows Scripting Host Shell object. */
            $WSHShell = new COM('WScript.Shell');

            /* Build the command to execute. */
            $command = sprintf(
                'cmd.exe /C "%s > "%s""', $command, $tempFile
            );

            /* Execute the command via the Windows Scripting Host Shell. */
            $returnCode = $WSHShell->Run($command, 0, true);

            /* Grab the contents of the temporary file and remove it. */
            $output = file($tempFile);
            @unlink($tempFile);
        }
        else
        {
            @exec($command, $output, $returnCode);
        }

        return array(
            'command'    => $command,
            'output'     => $output,
            'returnCode' => $returnCode
        );
    }

    private function odt2text($filename)
    {
        return $this->readZippedXML($filename, "content.xml");
    }

    private function docx2text($filename)
    {
        return $this->readZippedXML($filename, "word/document.xml");
    }

    private function readZippedXML($archiveFile, $dataFile)
    {
        // Create new ZIP archive
        $zip = new ZipArchive;

        // Open received archive file
        if (true === $zip->open($archiveFile))
        {
            // If done, search for the data file in the archive
            if (($index = $zip->locateName($dataFile)) !== false)
            {
                // If found, read it to the string
                $data = $zip->getFromIndex($index);
                // Close archive file
                $zip->close();
                // Load XML from a string
                // Skip errors and warnings
                $xml = DOMDocument::loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
                $raw_text = $xml->saveXML();
                // We need to add a space where end-of-line and end-of-paragraphs present 
                $raw_text_patched = str_replace(
                        array('<w:br/>', '</w:p>', '<text:line-break', '<text:p'),
                        array("\n<w:br/>", "\n</w:p>", "\n<text:line-break", "\n<text:p"), $raw_text);
                // Return data without XML formatting tags
                return utf8_encode(strip_tags($raw_text_patched));
            }
            $zip->close();
        }

        // In case of failure return empty string
        return "";
    }

    private function rtf2text($filename)
    {
        global $text;
        global $j;
        global $len;
        if(!is_readable($filename))
        {
            return null;
        }
        $text = file_get_contents($filename);
        if(!$text)
        {
            return null;
        }
        // we'll try to fix up the parts of the rtf as best we can
        // clean up the file a little to simplify parsing
        $text = str_replace("\r", ' ', $text); // returns
        $text = str_replace("\n", ' ', $text); // new lines
        $text = str_replace('  ', ' ', $text); // double spaces
        $text = str_replace('  ', ' ', $text); // double spaces
        $text = str_replace('  ', ' ', $text); // double spaces
        $text = str_replace('  ', ' ', $text); // double spaces
        $text = str_replace('} {', '}{', $text); // embedded spaces
        // skip over the heading stuff
        $j = strpos($text, '{', 1); // skip ahead to the first part of the header

        $loc = 1;
        $t = "";

        $ansa = "";
        $len = strlen($text);
        $this->getpgraph(); // skip by the first paragrap

        while ($j < $len)
        {
            $c = substr($text, $j, 1);
            if ($c == "\\")
            {
                // have a tag
                $tag = $this->gettag();
                if (strlen($tag) > 0)
                {
                    // process known tags
                    switch ($tag)
                    {
                        case 'par':
                            $ansa.="\r\n";
                            break;
                        // ad a list of common tags
                        // parameter tags
                        case 'spriority1':
                        case 'fprq2':
                        case 'author':
                        case 'operator':
                        case 'sqformat':
                        case 'company':
                        case 'xmlns1':
                        case 'wgrffmtfilter':
                        case 'pnhang':
                        case 'themedata':
                        case 'colorschememapping':
                            $tt = $this->gettag();
                            break;
                        case '*':
                        case 'info':
                        case 'stylesheet':
                            // gets to end of paragraph
                            $j--;
                            $this->getpgraph();
                        default:
                        // ignore the tag
                    }
                }
            }
            else
            {
                $ansa.=$c;
            }
            $j++;
        }
        $ansa = str_replace('{', '', $ansa);
        $ansa = str_replace('}', '', $ansa);
        return utf8_encode($ansa);
    }

    private function getpgraph()
    {
        // if the first char after a tag is { then throw out the entire paragraph
        // this has to be nested
        global $text;
        global $j;
        global $len;
        $nest = 0;
        while (true)
        {
            $j++;
            if ($j >= $len)
                break;
            if (substr($text, $j, 1) == '}')
            {
                if ($nest == 0)
                    return;
                $nest--;
            }
            if (substr($text, $j, 1) == '{')
            {
                $nest++;
            }
        }
        return;
    }

    private function gettag()
    {
        // gets the text following the / character or gets the param if it there
        global $text;
        global $j;
        global $len;
        $tag = '';
        while (true)
        {
            $j++;
            if ($j >= $len)
                break;
            $c = substr($text, $j, 1);
            if ($c == ' ')
                break;
            if ($c == ';')
                break;
            if ($c == '}')
                break;
            if ($c == "\\")
            {
                $j--;
                break;
            }
            if ($c == "{")
            {
                //getpgraph();
                break;
            }
            if ((($c >= '0') && ($c <= '9')) || (($c >= 'a') && ($c <= 'z')) || (($c >= 'A') && ($c <= 'Z')) || $c == "'" || $c == "-" || $c == "*") {
                $tag = $tag . $c;
            } else {
                // end of tag
                $j--;
                break;
            }
        }
        return $tag;
    }

}
?>
Anon7 - 2021