Blame | Letzte Änderung | Log anzeigen | RSS feed
<?php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */

/**
 * File::CSV
 *
 * PHP version 5 or later (all methods are declared static)
 *
 * Copyright (c) 2002-2008,
 *  Tomas V.V.Cox <cox@idecnet.com>,
 *  Helgi Þormar Þorbjörnsson <helgi@php.net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @category    File
 * @package     File
 * @author      Tomas V.V.Cox <cox@idecnet.com>
 * @author      Helgi Þormar Þorbjörnsson <helgi@php.net>
 * @copyright   2004-2011 The Authors
 * @license     http://www.opensource.org/licenses/bsd-license.php New BSD License
 * @version     CVS: $Id: CSV.php 309245 2011-03-15 02:02:41Z dufuz $
 * @link        http://pear.php.net/package/File
 */

require_once 'PEAR.php';
require_once 'File.php';

/**
 * File class for handling CSV files (Comma Separated Values), a common format
 * for exchanging data.
 *
 * Every method is static and is meant to be called as File_CSV::method().
 * Open file pointers are cached per file name and mode inside getPointer(),
 * and the header row (when the "header" option is on) is cached inside
 * _processHeaders(); both caches live for the whole request.
 *
 * TODO:
 *  - Usage example and Doc
 *  - Use getPointer() in discoverFormat
 *  - Add a line counter for being able to output better error reports
 *  - Store the last error in GLOBALS and add File_CSV::getLastError()
 *
 * Wish:
 *  - Other methods like readAll(), writeAll(), numFields(), numRows()
 *  - Try to detect if a CSV has header or not in discoverFormat()
 *    (not possible with CSV)
 *
 * Known Bugs:
 * (they have been analyzed, but for the moment the speed impact of properly
 *  handling these uncommon cases is too high, so they are not supported)
 *  - A field which is composed only of a single quoted separator
 *    (ie -> ;";";) is not handled properly
 *  - When there is exactly one field less than the expected number and there
 *    is a field with a separator inside, the parser will throw the
 *    "wrong count" error
 *
 * Info about CSV and links to other sources
 * http://rfc.net/rfc4180.html
 *
 * @author Tomas V.V.Cox <cox@idecnet.com>
 * @author Helgi Þormar Þorbjörnsson <helgi@php.net>
 * @package File
 * @license http://www.opensource.org/licenses/bsd-license.php New BSD License
 */
class File_CSV
{
    /**
     * This raiseError method works in a different way. It will always return
     * false (an error occurred) but it will call PEAR::raiseError() before
     * it. If no default PEAR global handler is set, will trigger an error.
     *
     * @param string $error The error message
     * @return bool always false
     */
    public static function raiseError($error)
    {
        // If a default PEAR Error handler is not set trigger the error
        // XXX Add a PEAR::isSetHandler() method?
        if ($GLOBALS['_PEAR_default_error_mode'] == PEAR_ERROR_RETURN) {
            PEAR::raiseError($error, null, PEAR_ERROR_TRIGGER, E_USER_WARNING);
        } else {
            PEAR::raiseError($error);
        }

        return false;
    }

    /**
     * Checks the configuration given by the user and fills in defaults.
     *
     * Missing optional keys are written back into $conf:
     * "sep" => ',' (only when a single field is configured), "quote" => '"',
     * "crlf" => "\n" and "eol2unix" => true.
     *
     * @access private
     * @param string &$error The error will be written here if any
     * @param array  &$conf  The configuration assoc array
     * @return string|null The error message, or null when the configuration
     *                     is valid
     */
    public static function _conf(&$error, &$conf)
    {
        // check conf
        if (!is_array($conf)) {
            return $error = 'Invalid configuration';
        }

        if (!isset($conf['fields']) || !(int)$conf['fields']) {
            return $error = 'The number of fields must be numeric (the "fields" key)';
        }

        if (isset($conf['sep'])) {
            if (strlen($conf['sep']) !== 1) {
                return $error = 'Separator can only be one char';
            }
        } elseif ($conf['fields'] > 1) {
            return $error = 'Missing separator (the "sep" key)';
        } else {
            // to avoid undefined index notices
            $conf['sep'] = ',';
        }

        if (isset($conf['quote'])) {
            if (strlen($conf['quote']) !== 1) {
                return $error = 'The quote char must be one char (the "quote" key)';
            }
        } else {
            $conf['quote'] = '"';
        }

        if (!isset($conf['crlf'])) {
            $conf['crlf'] = "\n";
        }

        if (!isset($conf['eol2unix'])) {
            $conf['eol2unix'] = true;
        }

        return null;
    }

    /**
     * Return or create the file descriptor associated with a file
     *
     * Pointers are cached per file name and mode. The configuration is only
     * validated (and completed with defaults) when the pointer is created,
     * not when a cached pointer is returned.
     *
     * @param string  $file  The name of the file
     * @param array   &$conf The configuration
     * @param string  $mode  The open mode (ex: FILE_MODE_READ or FILE_MODE_WRITE)
     * @param boolean $reset if passed as true and a resource for the file
     *                       exists then the file pointer is moved to the
     *                       beginning
     *
     * @return mixed A file resource or false
     */
    public static function getPointer($file, &$conf, $mode = FILE_MODE_READ, $reset = false)
    {
        static $resources = array();

        if (isset($resources[$file][$mode])) {
            if ($reset) {
                fseek($resources[$file][$mode], 0);
            }
            return $resources[$file][$mode];
        }

        $error = null;
        File_CSV::_conf($error, $conf);
        if ($error) {
            return File_CSV::raiseError($error);
        }

        // Ask File for the pointer with errors returned instead of handled,
        // so the failure can be routed through our own raiseError().
        PEAR::pushErrorHandling(PEAR_ERROR_RETURN);
        $fp = File::_getFilePointer($file, $mode);
        PEAR::popErrorHandling();

        if (PEAR::isError($fp)) {
            return File_CSV::raiseError($fp);
        }

        $resources[$file][$mode] = $fp;
        return $fp;
    }

    /**
     * Unquote data
     *
     * Doubled quote characters are collapsed to a single one, the value is
     * trimmed, an Excel style leading = in front of a double quote is
     * dropped and finally the surrounding quotes are removed.
     *
     * @param array|string $data  The data to unquote
     * @param string       $quote The quote char
     * @return array|string the unquoted data, same kind (array or string) as
     *                      the input; array keys are preserved
     */
    public static function unquote($data, $quote)
    {
        $isString = !is_array($data);
        if ($isString) {
            $data = array($data);
        }

        // Get rid of escaped quotes
        $data = str_replace($quote . $quote, $quote, $data);

        $tmp = array();
        foreach ($data as $key => $field) {
            // Trim first the string.
            $field = trim((string)$field);

            // In case of null fields (form: ;;) nothing is left to strip
            $field_len = strlen($field);
            if ($field_len > 0) {
                // excel compat: ="value" -> "value". Only the leading = is
                // removed so $field_len stays in sync with the string, and
                // the length guard avoids reading offset 1 of a 1-char field.
                if ($field_len > 1 && $field[0] === '=' && $field[1] === '"') {
                    $field = substr($field, 1);
                    --$field_len;
                }

                if ($field[0] === $quote && $field[$field_len - 1] === $quote) {
                    // Get rid of quotes around the field. For one or two
                    // chars nothing remains; substr() would return false
                    // there on PHP < 8, so yield '' explicitly.
                    $field = ($field_len > 2) ? substr($field, 1, -1) : '';
                }
            }

            if ($isString) {
                return $field;
            }
            $tmp[$key] = $field;
        }

        return $tmp;
    }

    /**
     * Reads a row of data as an array from a CSV file. It's able to
     * read memo fields with multiline data.
     *
     * The file is consumed one character at a time. A field ends at a
     * separator or line break that is not inside a quoted section.
     *
     * @param string $file  The filename to read from
     * @param array  &$conf The configuration of the CSV
     *
     * @return mixed Array with the data read or false on error/no more data
     */
    public static function readQuoted($file, &$conf)
    {
        if (!$fp = File_CSV::getPointer($file, $conf, FILE_MODE_READ)) {
            return false;
        }

        // $c is the current char, $prev the one before it, $old the one
        // before $prev; $buff collects the raw text of the current field.
        $buff = $old = $prev = $c = '';
        $ret = array();
        $fields = 1;
        $in_quote = false;
        $quote = $conf['quote'];
        $f = (int)$conf['fields'];
        $sep = $conf['sep'];

        while (false !== ($ch = fgetc($fp))) {
            $old = $prev;
            $prev = $c;
            $c = $ch;

            // Common case
            if ($c != $quote && $c != $sep && $c != "\n" && $c != "\r") {
                $buff .= $c;
                continue;
            }

            // Start quote.
            if (
                $in_quote === false &&
                $quote && $c == $quote &&
                (
                    $prev == $sep || $prev == "\n" || $prev === null ||
                    $prev == "\r" || $prev == '' || $prev == ' '
                    || $prev == '=' // excel compat
                )
            ) {
                $in_quote = true;
                // excel compat, removing the = part but only if we are in a quote.
                // (The old "$buff{n} = ''" form is a parse error on PHP 8 and
                // cannot assign an empty string since PHP 7.1.)
                if ($prev == '=') {
                    $buff = substr($buff, 0, -1);
                }
            }

            if ($in_quote) {
                // When does the quote end, make sure it's not double quoted
                if ($c == $sep && $prev == $quote && $old != $quote) {
                    $in_quote = false;
                } elseif ($c == $sep && $buff == $quote . $quote) {
                    // In case we are dealing with double quote but empty value
                    $in_quote = false;
                } elseif ($c == "\n" || $c == "\r") {
                    // The closing quote sits right before the EOL sequence,
                    // i.e. two chars back when the EOL is \r\n.
                    $sub = ($prev == "\r") ? 2 : 1;
                    $buff_len = strlen($buff);
                    if (
                        $buff_len >= $sub &&
                        $buff[$buff_len - $sub] == $quote
                    ) {
                        $in_quote = false;
                    }
                }
            }

            if (!$in_quote && ($c == $sep || $c == "\n" || $c == "\r")) {
                $return = File_CSV::_readQuotedFillers($fp, $f, $fields, $ret,
                                                       $buff, $quote, $c, $sep);
                if ($return !== false) {
                    return $return;
                }

                if ($prev == "\r") {
                    $buff = substr($buff, 0, -1);
                }

                // Convert EOL character to Unix EOL (LF).
                if ($conf['eol2unix']) {
                    $buff = preg_replace('/(\r\n|\r)$/', "\n", $buff);
                    // Below replaces things everywhere not just EOL
                    //$buff = str_replace(array("\r\n", "\r"), "\n", $buff);
                }

                $ret[] = File_CSV::unquote($buff, $quote);
                if (count($ret) === $f) {
                    return $ret;
                }
                $buff = '';
                ++$fields;
                continue;
            }

            $buff .= $c;
        }

        /* If it's the end of the file and we still have something in buffer
         * then we process it since files can have no CR/LF at the end
         */
        $feof = feof($fp);
        if ($feof && strlen($buff) > 0 && !in_array($buff, array("\r", "\n"))) {
            $ret[] = File_CSV::unquote($buff, $quote);
            if (count($ret) == $f) {
                return $ret;
            }
        }

        if ($feof && count($ret) !== $f) {
            $return = File_CSV::_readQuotedFillers($fp, $f, $fields, $ret,
                                                   $buff, $quote, $c, $sep);
            if ($return !== false) {
                return $return;
            }
        }

        return !$feof ? $ret : false;
    }

    /**
     * Completes a row whose field count does not match the configuration.
     *
     * - One separator too many: the rest of the line is skipped and the row
     *   is returned with the expected number of fields.
     * - Line break before all fields were read: the row is padded with
     *   empty strings; a completely empty line yields an empty array.
     *
     * @param resource $fp     the file resource
     * @param integer  $f      the expected number of fields
     * @param integer  $fields the number of the field currently being read
     * @param array    $ret    the processed fields in a array
     * @param string   $buff   the buffer before it gets put through unquote
     * @param string   $quote  Quote in use
     * @param string   &$c     the char currently being worked with; updated
     *                         while skipping to the end of the line
     * @param string   $sep    Separator in use
     *
     * @access private
     * @return array|boolean returns false if no data should return out.
     */
    public static function _readQuotedFillers($fp, $f, $fields, $ret, $buff, $quote, &$c, $sep)
    {
        // More fields than expected
        if ($c == $sep && (count($ret) + 1) === $f) {
            // Seek the pointer into linebreak character.
            while (true) {
                $c = fgetc($fp);
                // fgetc() returns false at EOF, which compares equal to ''
                if ($c == "\n" || $c == "\r" || $c == '') {
                    break;
                }
            }

            // Insert last field value.
            $ret[] = File_CSV::unquote($buff, $quote);
            return $ret;
        }

        // Less fields than expected
        if (($c == "\n" || $c == "\r") && $fields !== $f) {
            // Insert last field value.
            $ret[] = File_CSV::unquote($buff, $quote);

            // A blank line. Compare against '' rather than empty(), which
            // would also treat a line holding only "0" as blank.
            if (count($ret) === 1 && (string)$ret[0] === '') {
                return array();
            }

            // Pair the array elements to fields count. - inserting empty values
            $ret_count = count($ret);
            $sum = ($f - 1) - ($ret_count - 1);
            $data = array_merge($ret, array_fill($ret_count, $sum, ''));
            return $data;
        }

        return false;
    }

    /**
     * Reads a "row" from a CSV file and return it as an array
     *
     * The line is first split with a plain explode(). When that result does
     * not look trustworthy (unexpected field count, a last field with an
     * unbalanced or Excel style quote, more commas than fields) the file
     * pointer is rewound to the start of the line and the row is re-read by
     * readQuoted().
     *
     * NOTE(review): when the fast path ends with a wrong field count the
     * method raises an error and returns true; that value is kept for
     * backward compatibility.
     *
     * @param string $file  The CSV file
     * @param array  &$conf The configuration of the CSV
     *
     * @return mixed Array or false
     */
    public static function read($file, &$conf)
    {
        if (!$fp = File_CSV::getPointer($file, $conf, FILE_MODE_READ)) {
            return false;
        }

        // The size is limited to 4K.
        // Compare with false: a last line holding just "0" is valid data.
        $line = fgets($fp, 4096);
        if ($line === false) {
            return false;
        }

        if ($conf['fields'] === 1) {
            $fields      = array($line);
            $field_count = 1;
        } else {
            $fields      = explode($conf['sep'], $line);
            $field_count = count($fields);
        }

        // A trailing separator leaves a lone EOL char as last element
        $check_char = $fields[$field_count - 1];
        if ($check_char === "\n" || $check_char === "\r") {
            array_pop($fields);
            --$field_count;
        }

        // Last real field of the row. It is read by value: taking a
        // reference to the just-popped index would recreate it as null and
        // add a bogus field to the result.
        $last = ($field_count > 0) ? $fields[$field_count - 1] : '';

        $quote = $conf['quote'];
        $use_quoted_reader = ($field_count !== $conf['fields']);

        if (!$use_quoted_reader && $quote && $last !== '') {
            $use_quoted_reader =
                // quote opened in the last field but not closed on this line
                ($last[0] === $quote && substr(rtrim($last), -1) !== $quote)
                // excel support
                || ($last[0] === '=' && isset($last[1]) && $last[1] === $quote);
            // if the row has spaces or other extra chars before the quote
            //|| preg_match('|^\s+\\' . $conf['quote'] .'|', $last)
        }

        // XXX perhaps there is a separator inside a quoted field
        // || preg_match("|{$conf['quote']}.*{$conf['sep']}.*{$conf['quote']}|", $line)
        // The regex above is really slow
        // NOTE(review): the comma is hard-coded here regardless of the
        // configured separator; kept as is.
        if (!$use_quoted_reader && count(explode(',', $line)) > $field_count) {
            $use_quoted_reader = true;
        }

        if ($use_quoted_reader) {
            fseek($fp, -1 * strlen($line), SEEK_CUR);
            $fields = File_CSV::readQuoted($file, $conf);
            $fields = File_CSV::_processHeaders($fields, $conf);
            return $fields;
        }

        $fields = File_CSV::unquote($fields, $quote);

        if ($field_count != $conf['fields']) {
            File_CSV::raiseError("Read wrong fields number count: '". $field_count .
                                  "' expected ".$conf['fields']);
            return true;
        }

        $fields = File_CSV::_processHeaders($fields, $conf);
        return $fields;
    }

    /**
     * Process the field array being passed in and map the array over to
     * the header values if the configuration is set on.
     *
     * With $conf['header'] enabled the first row seen is stored as the
     * header row and returned unchanged; every later row is re-keyed by the
     * stored header names (columns without a header are dropped). The header
     * row is kept in a function-static variable, so it is shared by all
     * files read during the request.
     *
     * @param array $fields The CSV row columns
     * @param array $conf   File_CSV configuration
     *
     * @return array Processed array; non-array input (e.g. false at the end
     *               of the file) is returned untouched
     */
    public static function _processHeaders($fields, &$conf)
    {
        static $headers = array();

        // Nothing to map when the reader signalled EOF / an error
        if (!is_array($fields)) {
            return $fields;
        }

        if (isset($conf['header']) && $conf['header'] == true && empty($headers)) {
            // read the first row and assign to $headers
            $headers = $fields;
            return $headers;
        }

        if (!empty($headers)) {
            $tmp = array();
            foreach ($fields as $k => $v) {
                if (isset($headers[$k])) {
                    $tmp[$headers[$k]] = $v;
                }
            }
            $fields = $tmp;
        }

        return $fields;
    }

    /**
     * Internal use only, will be removed in the future
     *
     * Echoes the string with CR, LF and TAB made visible as _r_, _n_ and
     * _t_; null and the empty string are printed as placeholders.
     *
     * @param string $str The string to debug
     * @access private
     */
    public static function _dbgBuff($str)
    {
        if ($str === null) {
            $str = '_NULL_';
        } elseif ($str === '') {
            $str = 'Empty string';
        } else {
            $str = str_replace(
                array("\r", "\n", "\t"),
                array('_r_', '_n_', '_t_'),
                $str
            );
        }

        echo "buff: ($str)\n";
    }

    /**
     * Writes a struc (array) in a file as CSV
     *
     * A field is quoted when it is not numeric and contains the separator,
     * the quote char, a line break, or starts/ends with a space. Quote
     * chars inside a quoted field are doubled.
     *
     * NOTE(review): on a field count mismatch an error is raised but true is
     * returned; that value is kept for backward compatibility.
     *
     * @param string $file   The filename where to write the data
     * @param array  $fields Ordered array with the data
     * @param array  &$conf  The configuration of the dest CSV
     *
     * @return bool True on success false otherwise
     */
    public static function write($file, $fields, &$conf)
    {
        if (!$fp = File_CSV::getPointer($file, $conf, FILE_MODE_WRITE)) {
            return false;
        }

        // Fields are addressed by position below, so drop custom keys
        $fields = array_values($fields);

        $field_count = count($fields);
        if ($field_count != $conf['fields']) {
            File_CSV::raiseError("Wrong fields number count: '". $field_count .
                                  "' expected ".$conf['fields']);
            return true;
        }

        $write = '';
        $quote = $conf['quote'];
        for ($i = 0; $i < $field_count; ++$i) {
            // Write a single field
            $quote_field = false;

            // Only quote this field in the following cases:
            if (is_numeric($fields[$i])) {
                // Numeric fields should not be quoted
            } elseif (isset($conf['sep']) && (strpos($fields[$i], $conf['sep']) !== false)) {
                // Separator is present in field
                $quote_field = true;
            } elseif (strpos($fields[$i], $quote) !== false) {
                // Quote character is present in field
                $quote_field = true;
            } elseif (
                strpos($fields[$i], "\n") !== false
                || strpos($fields[$i], "\r") !== false
            ) {
                // Newline is present in field
                $quote_field = true;
            } elseif (substr($fields[$i], 0, 1) == " " || substr($fields[$i], -1) == " ") {
                // Space found at beginning or end of field value
                $quote_field = true;
            }

            if ($quote_field) {
                // Escape the quote character within the field (e.g. " becomes "")
                $quoted_value = str_replace($quote, $quote . $quote, $fields[$i]);

                $write .= $quote . $quoted_value . $quote;
            } else {
                $write .= $fields[$i];
            }

            $write .= ($i < ($field_count - 1)) ? $conf['sep'] : $conf['crlf'];
        }

        if (!fwrite($fp, $write, strlen($write))) {
            return File_CSV::raiseError('Can not write to file');
        }

        return true;
    }

    /**
     * Discover the format of a CSV file (the number of fields, the separator
     * and if it quote string fields)
     *
     * Only the first 30 lines (4K each at most) are inspected. The candidate
     * separators are TAB, ';', ':' and ',' plus the ones given by the caller.
     *
     * @param string $file      the CSV file name
     * @param array  $extraSeps extra separators that should be checked for.
     * @return mixed Assoc array with the keys "fields", "sep" and "quote"
     *               (quote is null when no quoting was detected), or false
     *               when the file can not be opened
     */
    public static function discoverFormat($file, $extraSeps = array())
    {
        if (!$fp = @fopen($file, 'rb')) {
            return File_CSV::raiseError("Could not open file: $file");
        }

        // Set auto detect line ending for Mac EOL support
        $oldini = ini_get('auto_detect_line_endings');
        if ($oldini != '1') {
            ini_set('auto_detect_line_endings', '1');
        }

        // Take the first 30 lines and store the number of occurrences
        // for each separator in each line
        $lines = '';
        for ($i = 0; $i < 30 && !feof($fp) && $line = fgets($fp, 4096); $i++) {
            $lines .= $line;
        }
        fclose($fp);

        if ($oldini != '1') {
            ini_set('auto_detect_line_endings', $oldini);
        }

        $seps = array("\t", ';', ':', ',');
        $seps = array_merge($seps, $extraSeps);
        $matches = array();
        $quotes = '"\'';

        // Blank out the content of quoted sections until nothing changes
        while ($lines != ($newLines = preg_replace('|((["\'])[^"]*(\2))|', '\2_\2', $lines))) {
            $lines = $newLines;
        }

        // "!== false" so that a CR at offset 0 is detected as well
        $eol   = (strpos($lines, "\r") !== false) ? "\r" : "\n";
        $lines = explode($eol, $lines);
        foreach ($lines as $line) {
            $orgLine = $line;
            foreach ($seps as $sep) {
                // The separator is embedded in "|"-delimited patterns, so
                // escape it; otherwise extra separators such as "|" or "."
                // would break or change the expressions.
                $qsep = preg_quote($sep, '|');

                $line = preg_replace("|^[$quotes$qsep]*$qsep*([$quotes][^$quotes]*[$quotes])|sm", '_', $orgLine);

                // Find all seps that are within quotes
                ///FIXME ... counts legitimate lines as bad ones

                // In case there's a whitespace in front of the field
                $regex = '|\s*?';
                // Match the first quote (optional), also optionally match = since it's excel stuff
                $regex.= "(?:\=?[$quotes])";
                $regex.= '(.*';
                // Don't match a sep if we are inside a quote
                // also don't accept the sep if it has a quote on the either side
                ///FIXME has to be possible if we are inside a quote! (tests fail because of this)
                $regex.= "(?:[^$quotes])$qsep(?:[^$quotes])";
                $regex.= '.*)';
                // Close quote (if it's present) and the sep (optional, could be end of line)
                $regex.= "(?:[$quotes](?:$qsep?))|Ums";
                preg_match_all($regex, $line, $match);

                // Finding all seps, within quotes or not
                $sep_count = substr_count($line, $sep);
                // Real count
                $matches[$sep][] = $sep_count - count($match[0]);
            }
        }

        $fields = array();
        $amount = array();
        // Group the results by amount of equal occurrences
        foreach ($matches as $sep => $res) {
            $times = array();
            $times[0] = 0;
            foreach ($res as $num) {
                if ($num > 0) {
                    $times[$num] = isset($times[$num]) ? $times[$num] + $num : 1;
                }
            }
            arsort($times);

            // Use max fields count.
            $fields[$sep] = max(array_flip($times));
            $amount[$sep] = $times[key($times)];
        }

        arsort($amount);
        $sep = key($amount);

        $conf['fields'] = $fields[$sep] + 1;
        $conf['sep']    = $sep;

        // Test if there are fields with quotes around in the first 30 lines
        $quote = null;
        $qsep  = preg_quote((string)$sep, '|');

        $string = implode('', $lines);
        foreach (array('"', '\'') as $q) {
            if (preg_match_all('|' . $qsep . '(?:\s*?)(\=?[' . $q . ']).*([' . $q . '])' . $qsep . '?|Us', $string, $match)) {
                if ($match[1][0] == $match[2][0]) {
                    $quote = $match[1][0];
                    break;
                }
            }

            if (
                preg_match_all('|^(\=?[' . $q . ']).*([' . $q . '])' . $qsep . '{0,1}|Ums', $string, $match)
                || preg_match_all('|(\=?[' . $q . ']).*([' . $q . '])' . $qsep . '\s$|Ums', $string, $match)
            ) {
                if ($match[1][0] == $match[2][0]) {
                    $quote = $match[1][0];
                    break;
                }
            }
        }

        $conf['quote'] = $quote;
        return $conf;
    }

    /**
     * Front to call getPointer and moving the resource to the
     * beginning of the file
     * Reset it if you like.
     *
     * @param string $file  The name of the file
     * @param array  &$conf The configuration
     * @param string $mode  The open mode (ex: FILE_MODE_READ or FILE_MODE_WRITE)
     *
     * @return boolean true on success false on failure
     */
    public static function resetPointer($file, &$conf, $mode)
    {
        if (!File_CSV::getPointer($file, $conf, $mode, true)) {
            return false;
        }

        return true;
    }
}
(tests fail because of this)$regex.= "(?:[^$quotes])$sep(?:[^$quotes])";$regex.= '.*)';// Close quote (if it's present) and the sep (optional, could be end of line)$regex.= "(?:[$quotes](?:$sep?))|Ums";preg_match_all($regex, $line, $match);// Finding all seps, within quotes or not$sep_count = substr_count($line, $sep);// Real count$matches[$sep][] = $sep_count - count($match[0]);}}$final = array();// Group the results by amount of equal ocurrencesforeach ($matches as $sep => $res) {$times = array();$times[0] = 0;foreach ($res as $k => $num) {if ($num > 0) {$times[$num] = isset($times[$num]) ? $times[$num] + $num : 1;}}arsort($times);// Use max fields count.$fields[$sep] = max(array_flip($times));$amount[$sep] = $times[key($times)];}arsort($amount);$sep = key($amount);$conf['fields'] = $fields[$sep] + 1;$conf['sep'] = $sep;// Test if there are fields with quotes around in the first 30 lines$quote = null;$string = implode('', $lines);foreach (array('"', '\'') as $q) {if (preg_match_all("|$sep(?:\s*?)(\=?[$q]).*([$q])$sep?|Us", $string, $match)) {if ($match[1][0] == $match[2][0]) {$quote = $match[1][0];break;}}if (preg_match_all("|^(\=?[$q]).*([$q])$sep{0,1}|Ums", $string, $match)|| preg_match_all("|(\=?[$q]).*([$q])$sep\s$|Ums", $string, $match)) {if ($match[1][0] == $match[2][0]) {$quote = $match[1][0];break;}}}$conf['quote'] = $quote;return $conf;}/*** Front to call getPointer and moving the resource to the* beginning of the file* Reset it if you like.** @param string $file The name of the file* @param array &$conf The configuration* @param string $mode The open node (ex: FILE_MODE_READ or FILE_MODE_WRITE)** @return boolean true on success false on failure*/function resetPointer($file, &$conf, $mode){if (!File_CSV::getPointer($file, $conf, $mode, true)) {return false;}return true;}}