Index: lams_build/build.xml =================================================================== diff -u -re98540848441375f30de49a3557a5c9b0e7bea99 -rf90ca89942ca1a4ebca88351343904dcfa0a654e --- lams_build/build.xml (.../build.xml) (revision e98540848441375f30de49a3557a5c9b0e7bea99) +++ lams_build/build.xml (.../build.xml) (revision f90ca89942ca1a4ebca88351343904dcfa0a654e) @@ -399,6 +399,10 @@ + + + + + + + + + + + Index: lams_build/lib/apache-poi/fontbox-2.0.28.jar =================================================================== diff -u Binary files differ Index: lams_build/lib/apache-poi/graphics2d-0.42.jar =================================================================== diff -u Binary files differ Index: lams_build/lib/apache-poi/jempbox-1.8.17.jar =================================================================== diff -u Binary files differ Index: lams_build/lib/apache-poi/pdfbox-2.0.28.jar =================================================================== diff -u Binary files differ Index: lams_build/lib/apache-poi/poi.module.xml =================================================================== diff -u -re98540848441375f30de49a3557a5c9b0e7bea99 -rf90ca89942ca1a4ebca88351343904dcfa0a654e --- lams_build/lib/apache-poi/poi.module.xml (.../poi.module.xml) (revision e98540848441375f30de49a3557a5c9b0e7bea99) +++ lams_build/lib/apache-poi/poi.module.xml (.../poi.module.xml) (revision f90ca89942ca1a4ebca88351343904dcfa0a654e) @@ -28,6 +28,10 @@ + + + + @@ -36,6 +40,7 @@ + Index: lams_build/lib/apache-tika/tika.module.xml =================================================================== diff -u -re98540848441375f30de49a3557a5c9b0e7bea99 -rf90ca89942ca1a4ebca88351343904dcfa0a654e --- lams_build/lib/apache-tika/tika.module.xml (.../tika.module.xml) (revision e98540848441375f30de49a3557a5c9b0e7bea99) +++ lams_build/lib/apache-tika/tika.module.xml (.../tika.module.xml) (revision f90ca89942ca1a4ebca88351343904dcfa0a654e) @@ -36,6 +36,7 @@ + \ No newline at end 
of file Index: lams_build/lib/jakarta-commons/commons-logging-1.2.jar =================================================================== diff -u Binary files differ Index: lams_build/lib/jakarta-commons/logging.module.xml =================================================================== diff -u --- lams_build/lib/jakarta-commons/logging.module.xml (revision 0) +++ lams_build/lib/jakarta-commons/logging.module.xml (revision f90ca89942ca1a4ebca88351343904dcfa0a654e) @@ -0,0 +1,29 @@ + + + + + + + + + \ No newline at end of file Index: lams_central/lams_central.eml =================================================================== diff -u -r21c7529af602718ef4a963e75e902c964f986831 -rf90ca89942ca1a4ebca88351343904dcfa0a654e --- lams_central/lams_central.eml (.../lams_central.eml) (revision 21c7529af602718ef4a963e75e902c964f986831) +++ lams_central/lams_central.eml (.../lams_central.eml) (revision f90ca89942ca1a4ebca88351343904dcfa0a654e) @@ -3,8 +3,9 @@ + - + Index: lams_common/lams_common.eml =================================================================== diff -u -r21c7529af602718ef4a963e75e902c964f986831 -rf90ca89942ca1a4ebca88351343904dcfa0a654e --- lams_common/lams_common.eml (.../lams_common.eml) (revision 21c7529af602718ef4a963e75e902c964f986831) +++ lams_common/lams_common.eml (.../lams_common.eml) (revision f90ca89942ca1a4ebca88351343904dcfa0a654e) @@ -1,38 +1,52 @@ - + + + + + + - + + + + + + + + + + Index: lams_common/lams_common.iml =================================================================== diff -u -r87a8f85eccc8c32edc96754f3a2c8f88f42f3085 -rf90ca89942ca1a4ebca88351343904dcfa0a654e --- lams_common/lams_common.iml (.../lams_common.iml) (revision 87a8f85eccc8c32edc96754f3a2c8f88f42f3085) +++ lams_common/lams_common.iml (.../lams_common.iml) (revision f90ca89942ca1a4ebca88351343904dcfa0a654e) @@ -1,13 +1,2 @@ - - - - - - - - - - - - \ No newline at end of file + \ No newline at end of file Index: 
lams_common/src/java/org/lamsfoundation/lams/util/FileUtil.java =================================================================== diff -u -r866e409c7358c310860593dbce3dcd894c706a9d -rf90ca89942ca1a4ebca88351343904dcfa0a654e --- lams_common/src/java/org/lamsfoundation/lams/util/FileUtil.java (.../FileUtil.java) (revision 866e409c7358c310860593dbce3dcd894c706a9d) +++ lams_common/src/java/org/lamsfoundation/lams/util/FileUtil.java (.../FileUtil.java) (revision f90ca89942ca1a4ebca88351343904dcfa0a654e) @@ -23,38 +23,20 @@ package org.lamsfoundation.lams.util; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.Reader; -import java.io.StringReader; -import java.io.StringWriter; -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.text.SimpleDateFormat; -import java.time.format.DateTimeFormatter; -import java.util.Collection; -import java.util.Date; -import java.util.Map; -import java.util.Properties; -import java.util.regex.Pattern; - -import javax.mail.internet.MimeUtility; -import javax.servlet.http.HttpServletRequest; -import javax.servlet.http.HttpSession; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerException; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - +import com.thoughtworks.xstream.XStream; +import com.thoughtworks.xstream.converters.ConversionException; +import com.thoughtworks.xstream.io.xml.StaxDriver; +import com.thoughtworks.xstream.security.AnyTypePermission; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import 
org.apache.tika.parser.pdf.PDFParser; +import org.apache.tika.parser.txt.CharsetDetector; +import org.apache.tika.parser.txt.CharsetMatch; +import org.apache.tika.sax.BodyContentHandler; import org.hibernate.id.Configurable; import org.hibernate.id.IdentifierGenerator; import org.hibernate.id.UUIDGenerator; @@ -65,16 +47,33 @@ import org.lamsfoundation.lams.web.session.SessionManager; import org.lamsfoundation.lams.web.util.AttributeNames; import org.w3c.dom.Document; - -import com.thoughtworks.xstream.XStream; -import com.thoughtworks.xstream.converters.ConversionException; -import com.thoughtworks.xstream.io.xml.StaxDriver; -import com.thoughtworks.xstream.security.AnyTypePermission; - +import org.xml.sax.SAXException; import xyz.capybara.clamav.ClamavClient; import xyz.capybara.clamav.ClamavException; import xyz.capybara.clamav.commands.scan.result.ScanResult; +import javax.mail.internet.MimeUtility; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpSession; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.*; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.text.SimpleDateFormat; +import java.time.format.DateTimeFormatter; +import java.util.Collection; +import java.util.Date; +import java.util.Map; +import java.util.Properties; +import java.util.regex.Pattern; + /** * General File Utilities */ @@ -84,8 +83,8 @@ public static final String ENCODING_UTF_8 = "UTF8"; public static final SimpleDateFormat EXPORT_TO_SPREADSHEET_TITLE_DATE_FORMAT = new SimpleDateFormat( "dd/MM/yyyy HH:mm:ss"); - public static final DateTimeFormatter EXPORT_TO_SPREADSHEET_TITLE_DATE_FORMATTER = DateTimeFormatter - 
.ofPattern("dd/MM/yyyy HH:mm:ss"); + public static final DateTimeFormatter EXPORT_TO_SPREADSHEET_TITLE_DATE_FORMATTER = DateTimeFormatter.ofPattern( + "dd/MM/yyyy HH:mm:ss"); public static final SimpleDateFormat EXPORT_TO_SPREADSHEET_CELL_DATE_FORMAT = new SimpleDateFormat("dd/MM/yyyy"); public static final String LAMS_WWW_SECURE_DIR = "secure"; @@ -95,8 +94,8 @@ private static final long numMilliSecondsInADay = 24 * 60 * 60 * 1000; // looks for URL with non-chopped content folder ID - private static final Pattern LEGACY_CONTENT_FOLDER_PATH = Pattern - .compile("/+lams/+www/+secure/+([0-9a-f\\-]{3,}/+).*?[\"']", Pattern.CASE_INSENSITIVE); + private static final Pattern LEGACY_CONTENT_FOLDER_PATH = Pattern.compile( + "/+lams/+www/+secure/+([0-9a-f\\-]{3,}/+).*?[\"']", Pattern.CASE_INSENSITIVE); public static final String ALLOWED_EXTENSIONS_FLASH = ".swf,.fla"; public static final String ALLOWED_EXTENSIONS_IMAGE = ".jpg,.gif,.jpeg,.png,.bmp"; @@ -167,7 +166,8 @@ /** * Check if this directory is empty. If checkSubdirectories = true, then it also checks its subdirectories to make * sure they aren't empty. If checkSubdirectories = true and the directory contains empty subdirectories it will - * return true. If checkSubdirectories = false and the directory contains empty subdirectories it will return false. + * return true. If checkSubdirectories = false and the directory contains empty subdirectories it will return + * false. */ public static boolean isEmptyDirectory(String directoryName, boolean checkSubdirectories) throws FileUtilException { @@ -212,8 +212,8 @@ * @param zipFileName * @return name of the new directory * @throws ZipFileUtilException - * if the java io temp directory is not defined, or we are unable to calculate a unique name for the - * expanded directory, or an IOException occurs. + * if the java io temp directory is not defined, or we are unable to calculate a unique name for the expanded + * directory, or an IOException occurs. 
*/ public static String createTempDirectory(String suffix) throws FileUtilException { @@ -232,16 +232,18 @@ tempSysDirName = javaTemp; } - String tempDirName = tempSysDirName + File.separator + FileUtil.prefix - + FileUtil.generateUniqueContentFolderID() + "_" + suffix; + String tempDirName = + tempSysDirName + File.separator + FileUtil.prefix + FileUtil.generateUniqueContentFolderID() + "_" + + suffix; File tempDir = new File(tempDirName); // try 100 different variations. If I can't find a unique // one in 100 tries, then give up. int i = 0; while (tempDir.exists() && (i < 100)) { - tempDirName = tempSysDirName + File.separator + FileUtil.prefix + FileUtil.generateUniqueContentFolderID() - + "_" + suffix; + tempDirName = + tempSysDirName + File.separator + FileUtil.prefix + FileUtil.generateUniqueContentFolderID() + "_" + + suffix; tempDir = new File(tempDirName); i++; } @@ -262,10 +264,10 @@ * If the directoryname is null or an empty string, a FileUtilException is thrown * * @param directoryName - * the name of the directory to create + * the name of the directory to create * @return boolean. Returns true if the directory is created and false otherwise * @throws FileUtilException - * if the directory name is null or an empty string + * if the directory name is null or an empty string */ public static boolean createDirectory(String directoryName) throws FileUtilException { boolean isCreated = false; @@ -286,21 +288,20 @@ * * If the parent directory has not been created yet, it will be created. * - * * @param parentDirName - * The name of the parent directory in which the subdirectory should be created in + * The name of the parent directory in which the subdirectory should be created in * @param subDirName - * The name of the subdirectory to create + * The name of the subdirectory to create * @return boolean. Returns true if the subdirectory was created and false otherwise * @throws FileUtilException - * if the parent/child directory name is null or empty. 
+ * if the parent/child directory name is null or empty. */ public static boolean createDirectory(String parentDirName, String subDirName) throws FileUtilException { boolean isSubDirCreated = false; boolean isParentDirCreated; - if ((parentDirName == null) || (parentDirName.length() == 0) || (subDirName == null) - || (subDirName.length() == 0)) { + if ((parentDirName == null) || (parentDirName.length() == 0) || (subDirName == null) || (subDirName.length() + == 0)) { throw new FileUtilException("A parent or subdirectory name must be specified"); } @@ -340,7 +341,7 @@ * Checks to see if there is a slash at the end of the string. * * @param stringToCheck - * the directoryName to check + * the directoryName to check * @return boolean. Returns true if there is a slash at the end and false if not. */ public static boolean trailingForwardSlashPresent(String stringToCheck) { @@ -360,7 +361,6 @@ /** * get file name from a string which may include directory information. For example : "c:\\dir\\ndp\\pp.txt"; will * return pp.txt.? If file has no path infomation, then just return input fileName. - * */ public static String getFileName(String fileName) { if (fileName == null) { @@ -383,7 +383,7 @@ * Get file directory info. * * @param fileName - * with path info. + * with path info. * @return return only path info with the given fileName */ public static String getFileDirectory(String fileName) { @@ -409,7 +409,7 @@ * * @param path * @param file - * could be file name,or sub directory path. + * could be file name,or sub directory path. * @return */ public static String getFullPath(String path, String file) { @@ -484,7 +484,7 @@ * Verify if a file with such extension is allowed to be uploaded. 
* * @param fileType - * file type can be of the following values:File, Image, Flash, Media + * file type can be of the following values:File, Image, Flash, Media * @param fileName */ public static boolean isExtensionAllowed(String fileType, String fileName) { @@ -550,10 +550,10 @@ * List files in temp directory older than numDays. * * @param numDays - * Number of days old that the directory should be to be deleted. Must be greater than 0 + * Number of days old that the directory should be to be deleted. Must be greater than 0 * @return array of files older than input date * @throws FileUtilException - * if numDays <= 0 + * if numDays <= 0 */ public static File[] getOldTempFiles(int numDays) throws FileUtilException { // Contract checking @@ -677,8 +677,9 @@ String contentFolderIDClean = contentFolderID.replace("-", ""); String contentDir = ""; for (int charIndex = 0; charIndex < 6; charIndex++) { - contentDir += contentFolderIDClean.substring(charIndex * 2, charIndex * 2 + 2) - + (isFileSystemPath ? File.separator : "/"); + contentDir += contentFolderIDClean.substring(charIndex * 2, charIndex * 2 + 2) + (isFileSystemPath + ? File.separator + : "/"); } return contentDir; } @@ -790,8 +791,8 @@ * String classname = path.substring(1, classFieldDelimiter); * String fieldname = path.substring(classFieldDelimiter + 1); */ - if ((fieldname == null) || fieldname.equals("") - || lastFieldRemoved.equals(classname + "." + fieldname)) { + if ((fieldname == null) || fieldname.equals("") || lastFieldRemoved.equals( + classname + "." + fieldname)) { // can't retry, so get out of here! break; } else { @@ -845,8 +846,9 @@ try { return Class.forName(classname); } catch (ClassNotFoundException e) { - FileUtil.log.error("Trying to remove unwanted fields from import but we can't find the matching class " - + classname + ". Aborting retry.", e); + FileUtil.log.error( + "Trying to remove unwanted fields from import but we can't find the matching class " + classname + + ". 
Aborting retry.", e); return null; } }
@@ -959,4 +961,36 @@
 	File uploadDir = FileUtil.getTmpFileUploadDir(tmpFileUploadId);
 	FileUtils.deleteQuietly(uploadDir);
     }
-}
+
+    /**
+     * Extracts the plain text of a PDF document using Apache Tika.
+     * <p>
+     * The text handed back by Tika is already a decoded Java string, so it is returned as is. Do not re-encode it
+     * with a charset guessed from the raw file bytes: a PDF is a binary container, the guess is meaningless, and
+     * an encode/decode round trip through mismatched charsets corrupts non-ASCII characters.
+     *
+     * @param file
+     *            the PDF file to read
+     * @return the extracted text with leading and trailing whitespace removed
+     * @throws TikaException
+     *             if Tika fails to parse the document
+     * @throws IOException
+     *             if the file cannot be opened or read
+     * @throws SAXException
+     *             if the content handler fails while receiving the extracted text
+     */
+    public static String getPDFContents(File file) throws TikaException, IOException, SAXException {
+        // -1 disables the handler's write limit so long documents are not truncated
+        BodyContentHandler handler = new BodyContentHandler(-1);
+        PDFParser pdfParser = new PDFParser();
+        try (InputStream inputStream = new BufferedInputStream(new FileInputStream(file))) {
+            pdfParser.parse(inputStream, handler, new Metadata(), new ParseContext());
+        }
+
+        String contents = handler.toString().strip();
+        if (log.isDebugEnabled()) {
+            log.debug("PDF contents:\n" + contents);
+        }
+        return contents;
+    }
+}
\ No newline at end of file