Moved the HtmlToOdtConverter class into the Odf class

This commit is contained in:
Tim Otte 2020-03-04 08:54:49 +01:00
parent 667f00872c
commit 6b3fdfbcd8
2 changed files with 210 additions and 255 deletions

View File

@ -1,247 +0,0 @@
<?php
// Learn more about this regex pattern: https://regexr.com/4vi60
define('HTML_REGEX_PATTERN', '/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/');

/**
 * Converts a small subset of inline HTML (br, strong/b, i/em, u, s, sub, sup and
 * styled span elements) into ODT text markup plus the style definitions it needs.
 */
class HtmlToOdtConverter
{
    /**
     * Converts a string with html inside into an odt compatible string.
     *
     * Default styles (always part of the result):
     *   strong/b = boldText, i/em = italicText, u = underlineText,
     *   s = strikethroughText, sub = subText, sup = supText
     * Custom styles (one per distinct styled span):
     *   <style:style style:name="customStyle[hash]" style:family="text"> [text properties] </style:style>
     *   font-size   = fo:font-size="60pt" style:font-size-asian="60pt" style:font-size-complex="60pt"
     *   font-family = style:font-name="Courier New"
     *   color       = fo:color="#0000ff"
     * Additionally, a font face has to be added to the font-face-decls for every font family used.
     * An example for a font face declaration:
     *   <style:font-face style:name="Courier New" svg:font-family="'Courier New'" />
     * The font names that need such a declaration are returned under the 'fonts' key.
     *
     * @param string $htmlText The text to convert
     * @return array Array with the keys 'automaticStyles' (style definitions as xml strings),
     *               'content' (the converted text) and 'fonts' (font family names)
     */
    public static function htmlToOdt($htmlText)
    {
        //TODO: Add font names to odt header
        $automaticStyles = array(
            '<style:style style:name="boldText" style:family="text"><style:text-properties fo:font-weight="bold" style:font-weight-asian="bold" style:font-weight-complex="bold" /></style:style>',
            '<style:style style:name="italicText" style:family="text"><style:text-properties fo:font-style="italic" style:font-style-asian="italic" style:font-style-complex="italic" /></style:style>',
            '<style:style style:name="underlineText" style:family="text"><style:text-properties style:text-underline-style="solid" style:text-underline-width="auto" style:text-underline-color="font-color" /></style:style>',
            '<style:style style:name="strikethroughText" style:family="text"><style:text-properties style:text-line-through-style="solid" style:text-line-through-type="single" /></style:style>',
            '<style:style style:name="subText" style:family="text"><style:text-properties style:text-position="sub 58%" /></style:style>',
            '<style:style style:name="supText" style:family="text"><style:text-properties style:text-position="super 58%" /></style:style>'
        );

        // Initialised explicitly instead of relying on by-reference auto-creation
        $customStyles = array();
        $fontDeclarations = array();
        $odtText = self::replaceHtmlWithOdtTag(self::getDataFromHtml($htmlText), $customStyles, $fontDeclarations);

        foreach ($customStyles as $key => $value) {
            $automaticStyles[] = '<style:style style:name="customStyle' . $key . '" style:family="text">' . $value . '</style:style>';
        }

        return array(
            'automaticStyles' => $automaticStyles,
            'content' => $odtText,
            'fonts' => $fontDeclarations
        );
    }

    /**
     * Replaces html tags with odt tags and returns an odt string.
     *
     * Tags without an odt equivalent (including span elements without a usable style)
     * are unwrapped: their content is kept, only the tag itself is dropped.
     *
     * @param array $tags An array with html tags generated by the getDataFromHtml() function
     * @param array $customStyles An array of style definitions that should be included inside the odt file (filled by reference, keyed by style id)
     * @param array $fontDeclarations An array of font declarations that should be included inside the odt file (filled by reference)
     * @return string
     */
    private static function replaceHtmlWithOdtTag($tags, &$customStyles, &$fontDeclarations)
    {
        if ($customStyles == null) $customStyles = array();
        if ($fontDeclarations == null) $fontDeclarations = array();

        // Tags that map one-to-one onto a predefined text style
        $simpleStyles = array(
            'strong' => 'boldText',
            'b' => 'boldText',
            'i' => 'italicText',
            'em' => 'italicText',
            'u' => 'underlineText',
            's' => 'strikethroughText',
            'sub' => 'subText',
            'sup' => 'supText'
        );

        $odtResult = '';
        foreach ((array) $tags as $tag) {
            // Plain text between tags is copied as it is
            if (isset($tag['text'])) {
                $odtResult .= $tag['text'];
                continue;
            }
            if (!isset($tag['name'])) {
                continue;
            }

            $name = strtolower($tag['name']);
            if ($name === 'br') {
                $odtResult .= '<text:line-break/>';
                continue;
            }

            $styleName = null;
            if (isset($simpleStyles[$name])) {
                $styleName = $simpleStyles[$name];
            } elseif ($name === 'span') {
                $properties = self::buildSpanProperties($tag, $fontDeclarations);
                if ($properties !== '') {
                    // The id is derived from the style content: identical styles share one
                    // definition and different styles can not collide
                    $key = md5($properties);
                    $customStyles[$key] = $properties;
                    $styleName = 'customStyle' . $key;
                }
            }

            // Nested tags are converted recursively, otherwise the plain inner text is used
            if (!empty($tag['children'])) {
                $inner = self::replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations);
            } else {
                $inner = isset($tag['innerText']) ? $tag['innerText'] : '';
            }

            if ($styleName === null) {
                $odtResult .= $inner;
            } else {
                $odtResult .= '<text:span text:style-name="' . $styleName . '">' . $inner . '</text:span>';
            }
        }
        return $odtResult;
    }

    /**
     * Builds the style:text-properties element for the inline style of a span tag.
     *
     * Only font-family, font-size (px or pt) and color (hex notation) are converted.
     * All converted properties are merged into a single element.
     *
     * @param array $tag A tag array generated by the getDataFromHtml() function
     * @param array $fontDeclarations Receives the font family names that are used (by reference)
     * @return string The text properties element or an empty string if nothing could be converted
     */
    private static function buildSpanProperties($tag, &$fontDeclarations)
    {
        if (!isset($tag['attributes']['style']) || !is_array($tag['attributes']['style'])) {
            return '';
        }

        $properties = array();
        foreach ($tag['attributes']['style'] as $styleName => $styleValue) {
            switch (strtolower(trim($styleName))) {
                case 'font-family':
                    // Only the first family of a fallback list is used
                    $families = explode(',', $styleValue);
                    $fontName = trim($families[0], " \t\n\r\0\x0B'\"");
                    // Skip names that are empty or would break the generated xml
                    if ($fontName === '' || preg_match('/[<>&"]/', $fontName)) {
                        break;
                    }
                    if (!in_array($fontName, $fontDeclarations, true)) {
                        $fontDeclarations[] = $fontName;
                    }
                    $properties[] = 'style:font-name="' . $fontName . '"';
                    break;
                case 'font-size':
                    if (preg_match('/([0-9]+(?:\.[0-9]+)?)\s?(px|pt)/', $styleValue, $matches)) {
                        $fontSize = floatval($matches[1]);
                        if ($matches[2] == 'px') {
                            // 1px = 0.75pt
                            $fontSize = round($fontSize * 0.75);
                        }
                        $properties[] = 'fo:font-size="' . $fontSize . 'pt" style:font-size-asian="' . $fontSize . 'pt" style:font-size-complex="' . $fontSize . 'pt"';
                    }
                    break;
                case 'color':
                    if (preg_match('/#([0-9A-Fa-f]{6}|[0-9A-Fa-f]{3})(?![0-9A-Fa-f])/', $styleValue, $matches)) {
                        $hex = $matches[1];
                        if (strlen($hex) === 3) {
                            // fo:color needs the long #rrggbb notation
                            $hex = $hex[0] . $hex[0] . $hex[1] . $hex[1] . $hex[2] . $hex[2];
                        }
                        $properties[] = 'fo:color="#' . $hex . '"';
                    }
                    break;
            }
        }

        if (count($properties) === 0) {
            return '';
        }
        return '<style:text-properties ' . implode(' ', $properties) . ' />';
    }

    /**
     * Checks if the given text is a html string
     * @param string $text The text to check
     * @return bool True if the html pattern matches, false if it does not match or the match failed
     */
    public static function isHtmlTag($text)
    {
        return preg_match(HTML_REGEX_PATTERN, $text) === 1;
    }

    /**
     * Checks if the given text includes a html string
     * @param string $text The text to check
     * @return bool
     */
    public static function hasHtmlTag($text)
    {
        $result = preg_match_all(HTML_REGEX_PATTERN, $text);
        return is_numeric($result) && $result > 0;
    }

    /**
     * Returns an array of html elements.
     *
     * Every entry is either array('text' => string) for plain text or an array with the keys
     * 'name', 'attributes', 'innerText' and 'children' (null if there are no nested tags).
     *
     * @param string $html A string with html tags
     * @return array
     */
    private static function getDataFromHtml($html)
    {
        $tags = array();
        $tempHtml = (string) $html;
        while (strlen($tempHtml) > 0) {
            // Only the first tag is needed per iteration
            if (!preg_match(HTML_REGEX_PATTERN, $tempHtml, $matches)) {
                // No (further) html tag, the rest is plain text
                $tags[] = array('text' => $tempHtml);
                break;
            }

            $fullTag = $matches[0];
            $tagName = $matches[1];
            // The third group is missing for self closing tags like <br />
            $hasContent = isset($matches[3]);
            $innerHtml = $hasContent ? $matches[3] : '';

            // Push the text in front of the html tag to the result array
            $tagOffset = strpos($tempHtml, $fullTag);
            if ($tagOffset > 0) {
                $tags[] = array('text' => substr($tempHtml, 0, $tagOffset));
            }

            // Cut the attribute text out of the opening tag. The second regex group can not be
            // used for this because a repeated group only keeps its last iteration.
            $prefixLength = strlen($tagName) + 1; // "<name"
            $suffixLength = $hasContent
                ? strlen($innerHtml) + strlen($tagName) + 4 // ">" + content + "</name>"
                : 3; // " />"
            $attributeText = (string) substr($fullTag, $prefixLength, strlen($fullTag) - $prefixLength - $suffixLength);

            // Push the html tag data to the result array
            $tags[] = array(
                'name' => $tagName,
                'attributes' => self::parseAttributes($attributeText),
                'innerText' => strip_tags($innerHtml),
                'children' => self::hasHtmlTag($innerHtml) ? self::getDataFromHtml($innerHtml) : null
            );

            // Remove the processed text and html tag from the html string
            $tempHtml = (string) substr($tempHtml, $tagOffset + strlen($fullTag));
        }
        return $tags;
    }

    /**
     * Parses the attribute part of an opening html tag.
     *
     * @param string $attributeText The text between the tag name and the end of the opening tag
     * @return array Attribute values by name. Attributes without a value are stored as true,
     *               the style attribute is stored under 'style' as an array of css properties.
     */
    private static function parseAttributes($attributeText)
    {
        $attributes = array();
        if (!preg_match_all('/([0-9A-Za-z]+(?:="[0-9A-Za-z\:\-\s\,\;\#\.]*")?)+/', $attributeText, $found)) {
            return $attributes;
        }

        foreach ($found[0] as $attribute) {
            $attribute = trim($attribute);
            if ($attribute === '') {
                continue;
            }
            // Check if the attribute has a value (like style="") or has no value (like required)
            if (strpos($attribute, '=') === false) {
                $attributes[$attribute] = true;
                continue;
            }

            $splitAttribute = explode('=', $attribute, 2);
            $attrName = trim($splitAttribute[0]);
            $attrValue = trim(str_replace('"', '', $splitAttribute[1]));

            if (strtolower($attrName) == 'style') {
                // Split the style properties and store them in an array
                $styles = array();
                foreach (explode(';', $attrValue) as $declaration) {
                    $splitStyle = explode(':', $declaration, 2);
                    // Skips empty declarations, e.g. after a trailing semicolon
                    if (count($splitStyle) < 2 || trim($splitStyle[0]) === '') {
                        continue;
                    }
                    $styles[strtolower(trim($splitStyle[0]))] = trim($splitStyle[1]);
                }
                $attributes['style'] = $styles;
            } else {
                // Store the value directly if this is not the style attribute
                $attributes[$attrName] = $attrValue;
            }
        }
        return $attributes;
    }
}

View File

@ -1,7 +1,6 @@
<?php
require 'Segment.php';
require_once __DIR__ . '/../../core/class/HtmlToOdtConverter.class.php';
class OdfException extends Exception
{
@ -145,33 +144,236 @@ class Odf
$value = ($charset == 'ISO-8859') ? utf8_encode($value) : $value;
// Check if the value includes html tags
if (HtmlToOdtConverter::hasHtmlTag($value) === true) {
// Convert the value to an odt compatible value
$result = HtmlToOdtConverter::htmlToOdt($value);
if ($this->_hasHtmlTag($value) === true) {
// Default styles for strong/b, i/em, u, s, sub & sup
$automaticStyles = array(
'<style:style style:name="boldText" style:family="text"><style:text-properties fo:font-weight="bold" style:font-weight-asian="bold" style:font-weight-complex="bold" /></style:style>',
'<style:style style:name="italicText" style:family="text"><style:text-properties fo:font-style="italic" style:font-style-asian="italic" style:font-style-complex="italic" /></style:style>',
'<style:style style:name="underlineText" style:family="text"><style:text-properties style:text-underline-style="solid" style:text-underline-width="auto" style:text-underline-color="font-color" /></style:style>',
'<style:style style:name="strikethroughText" style:family="text"><style:text-properties style:text-line-through-style="solid" style:text-line-through-type="single" /></style:style>',
'<style:style style:name="subText" style:family="text"><style:text-properties style:text-position="sub 58%" /></style:style>',
'<style:style style:name="supText" style:family="text"><style:text-properties style:text-position="super 58%" /></style:style>'
);
$this->vars[$tag] = $this->_replaceHtmlWithOdtTag($this->_getDataFromHtml($value), $customStyles, $fontDeclarations);
foreach ($customStyles as $key => $val) {
array_push($automaticStyles, '<style:style style:name="customStyle' . $key . '" style:family="text">' . $val . '</style:style>');
}
// Join the styles and add them to the content xml
$styles = '';
foreach ($result['automaticStyles'] as $style) {
foreach ($automaticStyles as $style) {
if (strpos($this->contentXml, $style) === false) {
$styles .= $style;
}
}
$this->contentXml = str_replace('</office:automatic-styles>', $styles . '</office:automatic-styles>', $this->contentXml);
// Join the font declarations and add them to the content xml
$fonts = '';
foreach ($result['fonts'] as $font) {
foreach ($fontDeclarations as $font) {
if (strpos($this->contentXml, 'style:name="' . $font . '"') === false) {
$fonts .= '<style:font-face style:name="' . $font . '" svg:font-family="\'' . $font . '\'" />';
}
}
$this->contentXml = str_replace('</office:font-face-decls>', $fonts . '</office:font-face-decls>', $this->contentXml);
// Set the var to the converted odt value
$this->vars[$tag] = $result['content'];
}
else $this->vars[$tag] = $value;
return $this;
}
/**
 * Replaces html tags with odt tags and returns an odt string.
 *
 * Supported tags: br, strong/b, i/em, u, s, sub, sup and span with an inline style
 * (font-family, font-size in px or pt, color in hex notation). Any other tag, and a span
 * without a usable style, is unwrapped: its content is kept, only the tag itself is dropped.
 *
 * @param array $tags An array with html tags generated by the getDataFromHtml() function
 * @param array $customStyles An array of style definitions that should be included inside the odt file (filled by reference, keyed by style id)
 * @param array $fontDeclarations An array of font declarations that should be included inside the odt file (filled by reference)
 * @return string
 */
private function _replaceHtmlWithOdtTag($tags, &$customStyles, &$fontDeclarations)
{
    if ($customStyles == null) $customStyles = array();
    if ($fontDeclarations == null) $fontDeclarations = array();

    // Tags that map one-to-one onto a predefined text style
    $simpleStyles = array(
        'strong' => 'boldText',
        'b' => 'boldText',
        'i' => 'italicText',
        'em' => 'italicText',
        'u' => 'underlineText',
        's' => 'strikethroughText',
        'sub' => 'subText',
        'sup' => 'supText'
    );

    $odtResult = '';
    foreach ((array) $tags as $tag) {
        // Check if the current item is a tag or just plain text
        if (isset($tag['text'])) {
            $odtResult .= $tag['text'];
            continue;
        }
        if (!isset($tag['name'])) {
            continue;
        }

        $name = strtolower($tag['name']);
        if ($name === 'br') {
            $odtResult .= '<text:line-break/>';
            continue;
        }

        $styleName = null;
        if (isset($simpleStyles[$name])) {
            $styleName = $simpleStyles[$name];
        } elseif ($name === 'span' && isset($tag['attributes']['style']) && is_array($tag['attributes']['style'])) {
            $properties = array();
            foreach ($tag['attributes']['style'] as $cssName => $cssValue) {
                switch (strtolower(trim($cssName))) {
                    case 'font-family':
                        // Only the first family of a fallback list is used
                        $families = explode(',', $cssValue);
                        $fontName = trim($families[0], " \t\n\r\0\x0B'\"");
                        // Skip names that are empty or would break the generated xml
                        if ($fontName === '' || preg_match('/[<>&"]/', $fontName)) {
                            break;
                        }
                        if (!in_array($fontName, $fontDeclarations, true)) {
                            $fontDeclarations[] = $fontName;
                        }
                        $properties[] = 'style:font-name="' . $fontName . '"';
                        break;
                    case 'font-size':
                        if (preg_match('/([0-9]+(?:\.[0-9]+)?)\s?(px|pt)/', $cssValue, $matches)) {
                            $fontSize = floatval($matches[1]);
                            if ($matches[2] == 'px') {
                                // 1px = 0.75pt
                                $fontSize = round($fontSize * 0.75);
                            }
                            $properties[] = 'fo:font-size="' . $fontSize . 'pt" style:font-size-asian="' . $fontSize . 'pt" style:font-size-complex="' . $fontSize . 'pt"';
                        }
                        break;
                    case 'color':
                        if (preg_match('/#([0-9A-Fa-f]{6}|[0-9A-Fa-f]{3})(?![0-9A-Fa-f])/', $cssValue, $matches)) {
                            $hex = $matches[1];
                            if (strlen($hex) === 3) {
                                // fo:color needs the long #rrggbb notation
                                $hex = $hex[0] . $hex[0] . $hex[1] . $hex[1] . $hex[2] . $hex[2];
                            }
                            $properties[] = 'fo:color="#' . $hex . '"';
                        }
                        break;
                }
            }
            if (count($properties) > 0) {
                // All properties are merged into a single text-properties element
                $odtStyles = '<style:text-properties ' . implode(' ', $properties) . ' />';
                // The id is derived from the style content: identical styles share one
                // definition and different styles can not collide
                $key = md5($odtStyles);
                $customStyles[$key] = $odtStyles;
                $styleName = 'customStyle' . $key;
            }
        }

        // Nested tags are converted recursively, otherwise the plain inner text is used
        if (!empty($tag['children'])) {
            $inner = $this->_replaceHtmlWithOdtTag($tag['children'], $customStyles, $fontDeclarations);
        } else {
            $inner = isset($tag['innerText']) ? $tag['innerText'] : '';
        }

        if ($styleName === null) {
            $odtResult .= $inner;
        } else {
            $odtResult .= '<text:span text:style-name="' . $styleName . '">' . $inner . '</text:span>';
        }
    }
    return $odtResult;
}
/**
 * Checks if the given text is a html string
 * @param string $text The text to check
 * @return bool True if the html pattern matches, false if it does not match or the match failed
 */
private function _isHtmlTag($text)
{
    // preg_match returns 1, 0 or false; compare so that a real bool is returned as documented
    return preg_match('/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/', $text) === 1;
}
/**
 * Tells whether at least one html tag can be found inside the given text
 * @param string $text The text to search in
 * @return bool True if the html pattern matches one or more times
 */
private function _hasHtmlTag($text)
{
    $matchCount = preg_match_all('/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/', $text);
    // A failed match does not return a number
    if (!is_numeric($matchCount)) {
        return false;
    }
    return $matchCount > 0;
}
/**
 * Returns an array of html elements.
 *
 * Every entry is either array('text' => string) for plain text or an array with the keys
 * 'name', 'attributes', 'innerText' and 'children' (null if there are no nested tags).
 * Attributes without a value are stored as true, the style attribute is stored under
 * 'style' as an array of css properties.
 *
 * @param string $html A string with html tags
 * @return array
 */
private function _getDataFromHtml($html)
{
    $tags = array();
    $tempHtml = (string) $html;
    while (strlen($tempHtml) > 0) {
        // Check if the string includes a html tag (only the first one is needed per iteration)
        if (!preg_match('/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/', $tempHtml, $matches)) {
            // No (further) html tag, the rest is plain text
            $tags[] = array('text' => $tempHtml);
            break;
        }

        $fullTag = $matches[0];
        $tagName = $matches[1];
        // The third group is missing for self closing tags like <br />
        $hasContent = isset($matches[3]);
        $innerHtml = $hasContent ? $matches[3] : '';

        // Push the text in front of the html tag to the result array
        $tagOffset = strpos($tempHtml, $fullTag);
        if ($tagOffset > 0) {
            $tags[] = array('text' => substr($tempHtml, 0, $tagOffset));
        }

        // Cut the attribute text out of the opening tag. The second regex group can not be
        // used for this because a repeated group only keeps its last iteration.
        $prefixLength = strlen($tagName) + 1; // "<name"
        $suffixLength = $hasContent
            ? strlen($innerHtml) + strlen($tagName) + 4 // ">" + content + "</name>"
            : 3; // " />"
        $attributeText = (string) substr($fullTag, $prefixLength, strlen($fullTag) - $prefixLength - $suffixLength);

        // Extract the attribute data from the html tag
        $attributes = array();
        if (preg_match_all('/([0-9A-Za-z]+(?:="[0-9A-Za-z\:\-\s\,\;\#\.]*")?)+/', $attributeText, $found)) {
            foreach ($found[0] as $attribute) {
                $attribute = trim($attribute);
                if ($attribute === '') {
                    continue;
                }
                // Check if the attribute has a value (like style="") or has no value (like required)
                if (strpos($attribute, '=') === false) {
                    $attributes[$attribute] = true;
                    continue;
                }

                $splitAttribute = explode('=', $attribute, 2);
                $attrName = trim($splitAttribute[0]);
                $attrValue = trim(str_replace('"', '', $splitAttribute[1]));

                // Check if the current attribute is a style attribute
                if (strtolower($attrName) == 'style') {
                    // Split the style properties and store them in an array
                    $styles = array();
                    foreach (explode(';', $attrValue) as $declaration) {
                        $splitStyle = explode(':', $declaration, 2);
                        // Skips empty declarations, e.g. after a trailing semicolon
                        if (count($splitStyle) < 2 || trim($splitStyle[0]) === '') {
                            continue;
                        }
                        $styles[strtolower(trim($splitStyle[0]))] = trim($splitStyle[1]);
                    }
                    $attributes['style'] = $styles;
                } else {
                    // Store the value directly in the $attributes array if this is not the style attribute
                    $attributes[$attrName] = $attrValue;
                }
            }
        }

        // Push the html tag data to the result array
        $tags[] = array(
            'name' => $tagName,
            'attributes' => $attributes,
            'innerText' => strip_tags($innerHtml),
            'children' => $this->_hasHtmlTag($innerHtml) ? $this->_getDataFromHtml($innerHtml) : null
        );

        // Remove the processed text and html tag from the html string
        $tempHtml = (string) substr($tempHtml, $tagOffset + strlen($fullTag));
    }
    return $tags;
}
/**
* Function to convert a HTML string into an ODT string