diff --git a/htdocs/includes/odtphp/odf.php b/htdocs/includes/odtphp/odf.php index 47226a60829..e3160b880f9 100644 --- a/htdocs/includes/odtphp/odf.php +++ b/htdocs/includes/odtphp/odf.php @@ -18,6 +18,7 @@ class OdfException extends Exception * @copyright 2010-2015 - Laurent Destailleur - eldy@users.sourceforge.net * @copyright 2010 - Vikas Mahajan - http://vikasmahajan.wordpress.com * @copyright 2012 - Stephen Larroque - lrq3000@gmail.com + * @copyright 2023 - Thomas Negre - contact@open-dsi.fr * @license https://www.gnu.org/copyleft/gpl.html GPL License * @version 1.5.0 */ @@ -46,6 +47,9 @@ class Odf public $userdefined=array(); const PIXEL_TO_CM = 0.026458333; + const FIND_TAGS_REGEX = '/<([A-Za-z0-9]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/'; + const FIND_ENCODED_TAGS_REGEX = '/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/'; + /** * Class constructor @@ -161,12 +165,16 @@ class Odf */ public function convertVarToOdf($value, $encode = true, $charset = 'ISO-8859') { - $value = $encode ? htmlspecialchars($value) : $value; - $value = ($charset == 'ISO-8859') ? utf8_encode($value) : $value; + $value = html_entity_decode($value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401); + + // fix breaklines. + $value = preg_replace('//', "
", $value); $convertedValue = $value; // Check if the value includes html tags if ($this->_hasHtmlTag($value) === true) { + $value = strip_tags($value, '
'); + // Default styles for strong/b, i/em, u, s, sub & sup $automaticStyles = array( '', @@ -180,7 +188,7 @@ class Odf $customStyles = array(); $fontDeclarations = array(); - $convertedValue = $this->_replaceHtmlWithOdtTag($this->_getDataFromHtml($value), $customStyles, $fontDeclarations); + $convertedValue = $this->_replaceHtmlWithOdtTag($this->_getDataFromHtml($value), $customStyles, $fontDeclarations, $encode, $charset); foreach ($customStyles as $key => $val) { array_push($automaticStyles, '' . $val . ''); @@ -204,21 +212,23 @@ class Odf } $this->contentXml = str_replace('', $fonts . '', $this->contentXml); } else { - $convertedValue = preg_replace('/(\r\n|\r|\n)/i', "", $value); + $convertedValue = $this->encode_chars($convertedValue, $encode, $charset); + $convertedValue = preg_replace('/(\r\n|\r|\n)/i', "", $convertedValue); } return $convertedValue; } /** - * Replaces html tags in with odt tags and returns an odt string - * - * @param array $tags An array with html tags generated by the getDataFromHtml() function - * @param array $customStyles An array of style defenitions that should be included inside the odt file - * @param array $fontDeclarations An array of font declarations that should be included inside the odt file - * @return string - */ - private function _replaceHtmlWithOdtTag($tags, &$customStyles, &$fontDeclarations) + * Replaces html tags in with odt tags and returns an odt string. Encodes and converts inner text. + * @param array $tags An array with html tags generated by the getDataFromHtml() function + * @param array $customStyles An array of style defenitions that should be included inside the odt file + * @param array $fontDeclarations An array of font declarations that should be included inside the odt file + * @param bool $encode If true, special XML characters are encoded + * @param string $charset Charset. See encode_chars() + * @return string + */ + private function _replaceHtmlWithOdtTag($tags, &$customStyles, &$fontDeclarations, $encode = false, $charset = '') { if ($customStyles == null) $customStyles = array(); if ($fontDeclarations == null) $fontDeclarations = array(); @@ -228,7 +238,8 @@ class Odf foreach ((array) $tags as $tag) { // Check if the current item is a tag or just plain text if (isset($tag['text'])) { - $odtResult .= $tag['text']; + $text = $this->encode_chars($tag['text'], $encode, $charset); + $odtResult .= $text; } elseif (isset($tag['name'])) { switch ($tag['name']) { case 'br': @@ -236,23 +247,23 @@ class Odf break; case 'strong': case 'b': - $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $tag['innerText']) . ''; + $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $this->encode_chars($tag['innerText'], $encode, $charset)) . ''; break; case 'i': case 'em': - $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $tag['innerText']) . ''; + $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $this->encode_chars($tag['innerText'], $encode, $charset)) . ''; break; case 'u': - $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $tag['innerText']) . ''; + $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $this->encode_chars($tag['innerText'], $encode, $charset)) . ''; break; case 's': - $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $tag['innerText']) . ''; + $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $this->encode_chars($tag['innerText'], $encode, $charset)) . ''; break; case 'sub': - $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $tag['innerText']) . ''; + $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $this->encode_chars($tag['innerText'], $encode, $charset)) . ''; break; case 'sup': - $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $tag['innerText']) . ''; + $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $this->encode_chars($tag['innerText'], $encode, $charset)) . ''; break; case 'span': if (isset($tag['attributes']['style'])) { @@ -287,9 +298,9 @@ class Odf } if (strlen($odtStyles) > 0) { // Generate a unique id for the style (using microtime and random because some CPUs are really fast...) - $key = floatval(str_replace('.', '', microtime(true)))+rand(0, 10); + $key = floatval(str_replace('.', '', microtime(true))) + rand(0, 10); $customStyles[$key] = $odtStyles; - $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $tag['innerText']) . ''; + $odtResult .= '' . ($tag['children'] != null ? $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations) : $this->encode_chars($tag['innerText'], $encode, $charset)) . ''; } } break; @@ -303,15 +314,30 @@ class Odf } /** - * Checks if the given text is a html string - * @param string $text The text to check - * @return bool + * Correctly encode chars + * @param string $text The text to encode or not + * @param bool $encode If true, special XML characters are encoded + * @param string $charset Charset + * @return string The converted text + * @see self::convertVarToOdf() */ - private function _isHtmlTag($text) + private function encode_chars($text, $encode = false, $charset = '') { - return preg_match('/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/', $text); + $newtext = $encode ? htmlspecialchars($text, ENT_QUOTES | ENT_XML1) : $text; + $newtext = ($charset == 'ISO-8859') ? utf8_encode($newtext) : $newtext; + return $newtext; } + /** + * Checks if the given text is a html string + * @param string $text The text to check + * @return bool + */ + private function _isHtmlTag($text) + { + return preg_match(self::FIND_TAGS_REGEX, $text); + } + /** * Checks if the given text includes a html string * @param string $text The text to check @@ -319,7 +345,7 @@ class Odf */ private function _hasHtmlTag($text) { - $result = preg_match_all('/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/', $text); + $result = preg_match_all(self::FIND_TAGS_REGEX, $text); return is_numeric($result) && $result > 0; } @@ -334,9 +360,8 @@ class Odf $tempHtml = $html; while (strlen($tempHtml) > 0) { - $matches = array(); // Check if the string includes a html tag - if (preg_match_all('/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/', $tempHtml, $matches)) { + if (preg_match_all(self::FIND_TAGS_REGEX, $tempHtml, $matches)) { $tagOffset = strpos($tempHtml, $matches[0][0]); // Check if the string starts with the html tag if ($tagOffset > 0) { @@ -348,13 +373,12 @@ class Odf $tempHtml = substr($tempHtml, $tagOffset); } // Extract the attribute data from the html tag - $explodedAttributes = array(); preg_match_all('/([0-9A-Za-z]+(?:="[0-9A-Za-z\:\-\s\,\;\#]*")?)+/', $matches[2][0], $explodedAttributes); $explodedAttributes = array_filter($explodedAttributes[0]); $attributes = array(); // Store each attribute with its name in the $attributes array $explodedAttributesCount = count($explodedAttributes); - for ($i=0; $i<$explodedAttributesCount; $i++) { + for ($i = 0; $i < $explodedAttributesCount; $i++) { $attribute = trim($explodedAttributes[$i]); // Check if the attribute has a value (like style="") or has no value (like required) if (strpos($attribute, '=') !== false) { @@ -368,7 +392,7 @@ class Odf // Split the style properties and store them in an array $explodedStyles = explode(';', $attrValue); $explodedStylesCount = count($explodedStyles); - for ($n=0; $n<$explodedStylesCount; $n++) { + for ($n = 0; $n < $explodedStylesCount; $n++) { $splitStyle = explode(':', $explodedStyles[$n]); $attributes[$attrName][trim($splitStyle[0])] = trim($splitStyle[1]); } diff --git a/test/phpunit/ODFTest.php b/test/phpunit/ODFTest.php new file mode 100644 index 00000000000..fdf31140b11 --- /dev/null +++ b/test/phpunit/ODFTest.php @@ -0,0 +1,379 @@ + + * Copyright (C) 2023 - Thomas Negre - contact@open-dsi.fr + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/** + * \file test/unit/ODFTest.php + * \ingroup odf + * \brief PHPUnit test for odf class. + */ + +global $conf,$user,$langs,$db; +//define('TEST_DB_FORCE_TYPE','mysql'); // This is to force using mysql driver +//require_once 'PHPUnit/Autoload.php'; +require_once dirname(__FILE__).'/../../htdocs/master.inc.php'; +require_once dirname(__FILE__).'/../../htdocs/includes/odtphp/odf.php'; + +if (empty($user->id)) { + print "Load permissions for admin user nb 1\n"; + $user->fetch(1); + $user->getrights(); +} +$conf->global->MAIN_DISABLE_ALL_MAILS=1; + +$langs->load("main"); + + +/** + * Class for PHPUnit tests + * + * @backupGlobals disabled + * @backupStaticAttributes enabled + * @remarks backupGlobals must be disabled to have db,conf,user and lang not erased. + */ +class ODFTest extends PHPUnit\Framework\TestCase +{ + protected $savconf; + protected $savuser; + protected $savlangs; + protected $savdb; + + /** + * Constructor + * We save global variables into local variables + * + * @return BOMTest + */ + public function __construct() + { + parent::__construct(); + + //$this->sharedFixture + global $conf,$user,$langs,$db; + $this->savconf=$conf; + $this->savuser=$user; + $this->savlangs=$langs; + $this->savdb=$db; + + print __METHOD__." db->type=".$db->type." user->id=".$user->id; + //print " - db ".$db->db; + print "\n"; + } + + /** + * setUpBeforeClass + * + * @return void + */ + public static function setUpBeforeClass() + { + global $conf,$user,$langs,$db; + $db->begin(); // This is to have all actions inside a transaction even if test launched without suite. + + print __METHOD__."\n"; + } + + /** + * tearDownAfterClass + * + * @return void + */ + public static function tearDownAfterClass() + { + global $conf,$user,$langs,$db; + $db->rollback(); + + print __METHOD__."\n"; + } + + /** + * Init phpunit tests + * + * @return void + */ + protected function setUp() + { + global $conf,$user,$langs,$db; + $conf=$this->savconf; + $user=$this->savuser; + $langs=$this->savlangs; + $db=$this->savdb; + + print __METHOD__."\n"; + } + + /** + * End phpunit tests + * + * @return void + */ + protected function tearDown() + { + print __METHOD__."\n"; + } + + /** + * test ODF convertVarToOdf + * + * @return int + */ + public function testODFconvertVarToOdf() + { + global $conf,$user,$langs,$db; + $conf=$this->savconf; + $user=$this->savuser; + $langs=$this->savlangs; + $db=$this->savdb; + + // we test using template_invoice, it does not matter, we just need a valid odt. + $filename = '../../htdocs/install/doctemplates/invoices/template_invoice.odt'; + $config = [ + 'PATH_TO_TMP' => "/tmp", + 'ZIP_PROXY' => "PclZipProxy", + 'DELIMITER_LEFT' => "{", + 'DELIMITER_RIGHT' => "}", + ]; + + $to_test = [ + /** No HTML **/ + // Simple strings + 1 => [ + 'to_convert' => 'Simple string', + 'encode' => true, + 'charset' => null, + 'expected' => 'Simple string', + ], + 2 => [ + 'to_convert' => 'Simple string', + 'encode' => false, + 'charset' => null, + 'expected' => 'Simple string', + ], + 3 => [ + 'to_convert' => "Simple string\nwith line break", + 'encode' => true, + 'charset' => null, + 'expected' => "Simple stringwith line break", + ], + 4 => [ + 'to_convert' => "Simple string\nwith line break", + 'encode' => false, + 'charset' => null, + 'expected' => "Simple stringwith line break", + ], + // Special chars + 5 => [ + 'to_convert' => 'One&two', + 'encode' => true, + 'charset' => null, + 'expected' => 'One&two', + ], + 6 => [ + 'to_convert' => 'One&two', + 'encode' => false, + 'charset' => null, + 'expected' => 'One&two', + ], + 7 => [ + 'to_convert' => "/a&él'èàüöç€Ğ~<>", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode("/a&él'èàüöç€Ğ~<>"), + ], + 8 => [ + 'to_convert' => "/a&él'èàüöç€Ğ~<>", + 'encode' => false, + 'charset' => null, + 'expected' => utf8_encode("/a&él'èàüöç€Ğ~<>"), + ], + // special chars with non-default charset + 9 => [ + 'to_convert' => "/a&él'èàüöç€Ğ~<>", + 'encode' => true, + 'charset' => 'UTF-16', + 'expected' => "/a&él'èàüöç€Ğ~<>", + ], + 10 => [ + 'to_convert' => "/a&él'èàüöç€Ğ~<>", + 'encode' => false, + 'charset' => 'UTF-16', // When the charset differs from ISO-8859 string is not converted. + 'expected' => "/a&él'èàüöç€Ğ~<>", + ], + 11 => [ + 'to_convert' => "Greater > than", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode("Greater > than"), + ], + 12 => [ + 'to_convert' => "Greater > than", + 'encode' => false, + 'charset' => null, + 'expected' => utf8_encode("Greater > than"), + ], + 13 => [ + 'to_convert' => "Smaller < than", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode("Smaller < than"), + ], + 14 => [ + 'to_convert' => "Smaller < than", + 'encode' => false, + 'charset' => null, + 'expected' => utf8_encode("Smaller < than"), + ], + /** HTML **/ + // break lines + 15 => [ + 'to_convert' => "Break
line", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode("Breakline"), + ], + 16 => [ + 'to_convert' => "Break
line", + 'encode' => false, + 'charset' => null, + 'expected' => utf8_encode("Breakline"), + ], + 17 => [ + 'to_convert' => "Break
line", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode("Breakline"), + ], + 18 => [ + 'to_convert' => "Break
line", + 'encode' => false, + 'charset' => null, + 'expected' => utf8_encode("Breakline"), + ], + // HTML tags + 19 => [ + 'to_convert' => "text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l'", + 'encode' => false, + 'charset' => 'UTF-8', + 'expected' => 'text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l\'', + ], + 20 => [ + 'to_convert' => "text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l'", + 'encode' => true, + 'charset' => 'UTF-8', + 'expected' => 'text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l'', + ], + 21 => [ + 'to_convert' => "text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l'", + 'encode' => false, + 'charset' => null, + 'expected' => utf8_encode('text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l\''), + ], + 22 => [ + 'to_convert' => "text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l'", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode('text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l''), + ], + 23 => [ + 'to_convert' => "text with intricatedtags", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode('text with intricatedtags'), + ], + + // One can also pass html-encoded string to the method + 24 => [ + 'to_convert' => 'One&two', + 'encode' => true, + 'charset' => null, + 'expected' => 'One&two', + ], + 25 => [ + 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", + 'encode' => false, + 'charset' => 'UTF-8', + 'expected' => 'text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l\'', + ], + 26 => [ + 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", + 'encode' => true, + 'charset' => 'UTF-8', + 'expected' => 'text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l'', + ], + 27 => [ + 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", + 'encode' => false, + 'charset' => null, + 'expected' => utf8_encode('text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l\''), + ], + 28 => [ + 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode('text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l''), + ], + + // // TODO custom styles are not tested for now : the custom style have a custom ID based on time. Not random, but hard to mock or predict. generated in _replaceHtmlWithOdtTag() case 'span'. + // [ + // 'to_convert' => '123 trucmachin > truc < troc > tracbla bla', + // 'encode' => true, + // 'charset' => 'UTF-8', + // 'expected' => "123 trucmachin > truc < troc > tracbla bla'", + // ], + + /* Tests that can evolve */ + // Following tests reflect the current behavior. They may evolve if the method behavior changes. + + // The method removes hyperlinks and tags that are not dealt with. + 29 => [ + 'to_convert' => '123 trucmachin > truc < troc > tracbla bla', + 'encode' => true, + 'charset' => null, + 'expected' => "123 trucmachin > truc < troc > tracbla bla", + ], + 30 => [ + 'to_convert' => '123

Title

bla', + 'encode' => true, + 'charset' => null, + 'expected' => "123 Title bla", + ], + // HTML should not take \n into account, but only
. + 31 => [ + 'to_convert' => "text with strong text , a line\nbreak and underlined words with it@lic sp&ciàlchärs éè l'", + 'encode' => false, + 'charset' => 'UTF-8', + 'expected' => 'text with strong text , a line'."\n".'break and underlined words with it@lic sp&ciàlchärs éè l\'', + ], + ]; + + $odf=new Odf($filename, array()); + if (is_object($odf)) $result = 1; // Just to test + + foreach ($to_test as $case) { + if ($case['charset'] !== null) { + $res = $odf->convertVarToOdf($case['to_convert'], $case['encode'], $case['charset']); + } else { + $res = $odf->convertVarToOdf($case['to_convert'], $case['encode']); + } + $this->assertEquals($res, $case['expected']); + } + + print __METHOD__." result=".$result."\n"; + + return $result; + } +}