CSV.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. <?php
  2. // +----------------------------------------------------------------------+
  3. // | PHP Version 4 |
  4. // +----------------------------------------------------------------------+
  5. // | Copyright (c) 2002-2003 Tomas Von Veschler Cox |
  6. // +----------------------------------------------------------------------+
  7. // | This source file is subject to version 2.0 of the PHP license, |
  8. // | that is bundled with this package in the file LICENSE, and is |
  9. // | available at through the world-wide-web at |
  10. // | http://www.php.net/license/2_02.txt. |
  11. // | If you did not receive a copy of the PHP license and are unable to |
  12. // | obtain it through the world-wide-web, please send a note to |
  13. // | license@php.net so we can mail you a copy immediately. |
  14. // +----------------------------------------------------------------------+
  15. // | Authors: Tomas V.V.Cox <cox@idecnet.com> |
  16. // | |
  17. // +----------------------------------------------------------------------+
  18. //
  19. // $Id: CSV.php,v 1.13 2003/01/04 11:54:55 mj Exp $
  20. require_once 'PEAR.php';
  21. require_once 'File.php';
  /**
   * File class for handling CSV files (Comma Separated Values), a common format
   * for exchanging data.
   *
   * Every method is invoked as File_CSV::method() inside this class and none of
   * them uses $this.
   *
   * The configuration array ($conf) understood by the methods has these keys:
   *  - 'sep'    (required) the field separator, exactly one char
   *  - 'fields' (required) the number of fields per row, numeric
   *  - 'quote'  (optional) the quote char, exactly one char; defaults to null
   *  - 'crlf'   (optional) the end of line written by write(); defaults to "\n"
   *  - 'header' (optional) when not empty, the first row is read and discarded
   *             the first time the file is opened for reading
   *
   * TODO:
   *  - Usage example and Doc
   *  - Use getPointer() in discoverFormat
   *  - Add a line counter for being able to output better error reports
   *  - Store the last error in GLOBALS and add File_CSV::getLastError()
   *
   * Wish:
   *  - Support Mac EOL format
   *  - Other methods like readAll(), writeAll(), numFields(), numRows()
   *  - Try to detect if a CSV has header or not in discoverFormat()
   *
   * Known Bugs:
   * (they have been analyzed, but for the moment the speed penalty of handling
   *  these uncommon cases properly is too high, so they are not supported)
   *  - A field which is composed only of a single quoted separator
   *    (ie -> ;";";) is not handled properly
   *  - When a row has exactly one field less than the expected number and one
   *    of its fields contains a separator, the parser will throw the
   *    "wrong count" error
   *
   * @author Tomas V.V.Cox <cox@idecnet.com>
   * @package File
   */
  class File_CSV
  {
      /**
       * This raiseError method works in a different way. It will always return
       * false (an error occurred) but it will call PEAR::raiseError() before
       * it. If no default PEAR global handler is set, will trigger an error.
       *
       * @param mixed $error The error message (getPointer() also passes a
       *                     PEAR error object here)
       * @return bool always false
       */
      function raiseError($error)
      {
          // If a default PEAR Error handler is not set trigger the error
          // XXX Add a PEAR::isSetHandler() method?
          if ($GLOBALS['_PEAR_default_error_mode'] == PEAR_ERROR_RETURN) {
              PEAR::raiseError($error, null, PEAR_ERROR_TRIGGER, E_USER_WARNING);
          } else {
              PEAR::raiseError($error);
          }
          return false;
      }

      /**
       * Checks the configuration given by the user and fills in the defaults
       * for the optional 'quote' (null) and 'crlf' ("\n") keys.
       *
       * @param array  &$conf  The configuration assoc array
       * @param string &$error The error will be written here if any; it is
       *                       reset to null when the configuration is valid
       * @return mixed The error message on failure, nothing on success
       */
      function _conf(&$conf, &$error)
      {
          // Start clean so a stale value left in the caller's variable is
          // never mistaken for a validation failure
          $error = null;
          if (!is_array($conf)) {
              return $error = "Invalid configuration";
          }
          if (!isset($conf['sep'])) {
              return $error = 'Missing separator (the "sep" key)';
          }
          if (strlen($conf['sep']) != 1) {
              return $error = 'Separator can only be one char';
          }
          if (!isset($conf['fields']) || !is_numeric($conf['fields'])) {
              return $error = 'The number of fields must be numeric (the "fields" key)';
          }
          if (isset($conf['quote'])) {
              if (strlen($conf['quote']) != 1) {
                  return $error = 'The quote char must be one char (the "quote" key)';
              }
          } else {
              $conf['quote'] = null;
          }
          if (!isset($conf['crlf'])) {
              $conf['crlf'] = "\n";
          }
      }

      /**
       * Return or create the file descriptor associated with a file
       *
       * The descriptor and the validated configuration are cached per file
       * name. On a cache hit $conf is overwritten with the configuration that
       * was stored when that same file was first opened.
       *
       * NOTE(review): the cache is keyed by file name only, so $mode is ignored
       * for a file that is already cached.
       *
       * @param string $file  The name of the file
       * @param array  &$conf The configuration
       * @param string $mode  The open mode (ex: FILE_MODE_READ or FILE_MODE_WRITE)
       *
       * @return mixed A file resource or false
       */
      function getPointer($file, &$conf, $mode = FILE_MODE_READ)
      {
          static $resources = array();
          // One configuration per file: a single shared slot would hand the
          // settings of the most recently opened file to every cached file
          static $config = array();
          if (isset($resources[$file])) {
              $conf = $config[$file];
              return $resources[$file];
          }
          $error = null;
          File_CSV::_conf($conf, $error);
          if ($error) {
              return File_CSV::raiseError($error);
          }
          // Ask PEAR to return the error so it can be reported through
          // File_CSV::raiseError() instead of the global handler
          PEAR::pushErrorHandling(PEAR_ERROR_RETURN);
          $fp = &File::_getFilePointer($file, $mode);
          PEAR::popErrorHandling();
          if (PEAR::isError($fp)) {
              return File_CSV::raiseError($fp);
          }
          // Both caches must be filled before the header is skipped, because
          // read() comes back through getPointer() for this same file
          $resources[$file] = $fp;
          $config[$file]    = $conf;
          if ($mode == FILE_MODE_READ && !empty($conf['header'])) {
              // Consume the header row so the first read() returns data
              if (!File_CSV::read($file, $conf)) {
                  return false;
              }
          }
          return $fp;
      }

      /**
       * Unquote data
       *
       * The surrounding quote chars are removed only when the field both
       * starts and ends with $quote. A field made of a single quote char is
       * returned untouched, since it has no closing quote.
       *
       * @param string $field The data to unquote
       * @param string $quote The quote char (null or empty: nothing is removed)
       * @return string the unquoted data
       */
      function unquote($field, $quote)
      {
          // At least two chars are needed for an opening and a closing quote;
          // this also covers null fields (form: ;;)
          if (!$quote || strlen($field) < 2) {
              return $field;
          }
          if (substr($field, 0, 1) == $quote && substr($field, -1) == $quote) {
              return substr($field, 1, -1);
          }
          return $field;
      }

      /**
       * Reads a row of data as an array from a CSV file. It's able to
       * read memo fields with multiline data.
       *
       * The file is consumed char by char. The end of the file is treated as
       * the end of the current row, so a last row without a trailing newline
       * is returned as well.
       *
       * @param string $file  The filename from where to read the data
       * @param array  &$conf The configuration of the CSV
       *
       * @return mixed Array with the data read, false on error/no more data,
       *               or true when a row with a wrong number of fields was
       *               found (an error is raised for it and the row is skipped)
       */
      function readQuoted($file, &$conf)
      {
          if (!$fp = File_CSV::getPointer($file, $conf, FILE_MODE_READ)) {
              return false;
          }
          $buff     = '';     // chars of the field being read
          $c        = null;   // current char, null until something is read
          $ret      = array();
          $i        = 1;      // number of the field being read
          $in_quote = false;
          $quote    = $conf['quote'];
          $sep      = $conf['sep'];
          $f        = $conf['fields'];
          while (true) {
              $ch     = fgetc($fp);
              $at_eof = ($ch === false);
              if ($at_eof) {
                  if (!feof($fp)) {
                      // Read failure, not end of file: return what was parsed
                      return $ret;
                  }
                  if ($c === null) {
                      // Nothing pending: no more data
                      return false;
                  }
                  // Terminate the pending row as if a newline had been read
                  $ch = "\n";
              }
              $prev = $c;
              $c    = $ch;
              // Common case
              if ($c != $quote && $c != $sep && $c != "\n") {
                  $buff .= $c;
                  continue;
              }
              if ($c == $quote && $quote &&
                  ($prev == $sep || $prev == "\n" || $prev === null))
              {
                  // A quote right at the start of a field opens a quoted field
                  $in_quote = true;
              } elseif ($in_quote) {
                  // When ends quote
                  if ($c == $sep && $prev == $quote) {
                      $in_quote = false;
                  } elseif ($c == "\n") {
                      // The closing quote is the last buffered char, or the one
                      // before the "\r" when the line ends with "\r\n"
                      $sub = ($prev == "\r") ? 2 : 1;
                      if ((strlen($buff) >= $sub) &&
                          (substr($buff, -1 * $sub, 1) == $quote))
                      {
                          $in_quote = false;
                      }
                  }
              }
              if (!$in_quote && ($c == $sep || $c == "\n")) {
                  // More fields than expected
                  if (($c == $sep) && ((count($ret) + 1) == $f)) {
                      // Skip the rest of the line; stop at the end of the file
                      // too, otherwise a last line without newline loops forever
                      while ($c !== false && $c != "\n") {
                          $c = fgetc($fp);
                      }
                      File_CSV::raiseError("Read more fields than the ".
                                           "expected ".$conf['fields']);
                      return true;
                  }
                  // Less fields than expected
                  if (($c == "\n") && ($i != $f)) {
                      File_CSV::raiseError("Read wrong fields number count: '". $i .
                                           "' expected ".$conf['fields']);
                      return true;
                  }
                  if ($prev == "\r") {
                      $buff = substr($buff, 0, -1);
                  }
                  $ret[] = File_CSV::unquote($buff, $quote);
                  if (count($ret) == $f) {
                      return $ret;
                  }
                  $buff = '';
                  $i++;
                  continue;
              }
              if ($at_eof) {
                  // The file ended inside a quoted field that was never closed
                  return false;
              }
              $buff .= $c;
          }
      }

      /**
       * Reads a "row" from a CSV file and return it as an array
       *
       * The line is first split with a plain explode(). When a quote char is
       * configured and the line looks like it needs real parsing (wrong field
       * count, or a last field that opens a quote without closing it), the
       * file is rewound to the start of the line and readQuoted() is used.
       *
       * @param string $file  The CSV file
       * @param array  &$conf The configuration of the CSV
       *
       * @return mixed Array with the data read, false on error/no more data,
       *               or true when the row has a wrong number of fields (an
       *               error is raised for it)
       */
      function read($file, &$conf)
      {
          if (!$fp = File_CSV::getPointer($file, $conf, FILE_MODE_READ)) {
              return false;
          }
          // The size is limited to 4K
          $line = fgets($fp, 4096);
          // Strict test: a last line holding just "0" is data, not the end
          if ($line === false || $line === '') {
              return false;
          }
          $fields = explode($conf['sep'], $line);
          $last_k = count($fields) - 1;
          $last   = $fields[$last_k];
          if ($conf['quote']) {
              // The last field starts with a quote but the line ends before it
              // is closed: probably a multiline field
              $open_quote = substr($last, -1) == "\n"
                         && substr($last, 0, 1) == $conf['quote']
                         && substr(rtrim($last), -1) != $conf['quote'];
              // Fallback to read the line with readQuoted when guess
              // that the simple explode won't work right
              // XXX perhaps there is a separator inside a quoted field
              if ($open_quote || count($fields) != $conf['fields']) {
                  fseek($fp, -1 * strlen($line), SEEK_CUR);
                  return File_CSV::readQuoted($file, $conf);
              }
              $fields[$last_k] = rtrim($last);
              foreach ($fields as $k => $v) {
                  $fields[$k] = File_CSV::unquote($v, $conf['quote']);
              }
          } else {
              // Drop the end of line, which is not part of the last field
              $fields[$last_k] = rtrim($last, "\r\n");
          }
          if (count($fields) != $conf['fields']) {
              File_CSV::raiseError("Read wrong fields number count: '". count($fields) .
                                   "' expected ".$conf['fields']);
              return true;
          }
          return $fields;
      }

      /**
       * Internal use only, will be removed in the future
       *
       * Prints the string with CR, LF and TAB made visible.
       *
       * @param string $str The string to debug
       * @access private
       */
      function _dbgBuff($str)
      {
          $str = str_replace(array("\r", "\n", "\t"),
                             array('_r_', '_n_', '_t_'),
                             $str);
          echo "buff: ($str)\n";
      }

      /**
       * Writes a struc (array) in a file as CSV
       *
       * Non numeric values are wrapped in the quote char when one is
       * configured. The values are written in the order of the array; its
       * keys are ignored.
       *
       * @param string $file   The filename where to write the data
       * @param array  $fields Ordered array with the data
       * @param array  &$conf  The configuration of the dest CSV
       *
       * @return bool True on success (and also, after raising an error, when
       *              the number of fields is wrong), false otherwise
       */
      function write($file, $fields, &$conf)
      {
          if (!$fp = File_CSV::getPointer($file, $conf, FILE_MODE_WRITE)) {
              return false;
          }
          if (count($fields) != $conf['fields']) {
              File_CSV::raiseError("Wrong fields number count: '". count($fields) .
                                   "' expected ".$conf['fields']);
              return true;
          }
          $cells = array();
          // foreach instead of $fields[$i], so that associative or sparse
          // arrays are written too
          foreach ($fields as $value) {
              if (!is_numeric($value) && $conf['quote']) {
                  $cells[] = $conf['quote'] . $value . $conf['quote'];
              } else {
                  $cells[] = $value;
              }
          }
          $write = implode($conf['sep'], $cells) . $conf['crlf'];
          if (!fwrite($fp, $write)) {
              return File_CSV::raiseError('Can not write to file');
          }
          return true;
      }

      /**
       * Discover the format of a CSV file (the number of fields, the separator
       * and if it quote string fields)
       *
       * The separator is searched among tab, ';', ':' and ',' in the first 10
       * lines; the quote char among '"' and "'" in the first 5 lines.
       *
       * @param string the CSV file name
       * @return mixed Assoc array with the keys 'fields', 'sep' and 'quote'
       *               (null when no quote char is detected), or false
       */
      function discoverFormat($file)
      {
          if (!$fp = @fopen($file, 'r')) {
              return File_CSV::raiseError("Could not open file: $file");
          }
          $seps = array("\t", ';', ':', ',');
          $matches = array();
          // Take the first 10 lines and store the number of ocurrences
          // for each separator in each line
          for ($i = 0; ($i < 10) && (($line = fgets($fp, 4096)) !== false); $i++) {
              foreach ($seps as $sep) {
                  $matches[$sep][$i] = substr_count($line, $sep);
              }
          }
          $fields = array();
          $amount = array();
          // Group the results by amount of equal ocurrences
          foreach ($matches as $sep => $res) {
              // $times[n] = number of lines where the separator appears n times
              $times = array(0 => 0);
              foreach ($res as $num) {
                  if ($num > 0) {
                      $times[$num] = (isset($times[$num])) ? $times[$num] + 1 : 1;
                  }
              }
              arsort($times);
              reset($times);
              // The most repeated ocurrences count and how many lines share it
              $fields[$sep] = key($times);
              $amount[$sep] = current($times);
          }
          // The winner is the separator whose count is shared by more lines
          arsort($amount);
          reset($amount);
          $sep   = key($amount);
          // An empty file leaves nothing to choose from
          $count = count($amount) ? $fields[$sep] : 0;
          if (empty($count)) {
              // Do not leak the descriptor on the error path
              fclose($fp);
              return File_CSV::raiseError('Could not discover the separator');
          }
          $conf = array();
          $conf['fields'] = $count + 1;
          $conf['sep'] = $sep;
          // Test if there are fields with quotes arround in the first 5 lines
          $quotes = '"\'';
          $quote = null;
          rewind($fp);
          for ($i = 0; ($i < 5) && (($line = fgets($fp, 4096)) !== false); $i++) {
              // A quoted field between two separators
              if (preg_match("|$sep([$quotes]).*([$quotes])$sep|U", $line, $match)) {
                  if ($match[1] == $match[2]) {
                      $quote = $match[1];
                      break;
                  }
              }
              // A quoted field at the start or at the end of the line
              if (preg_match("|^([$quotes]).*([$quotes])$sep|", $line, $match)
                  || preg_match("|([$quotes]).*([$quotes])$sep\s$|Us", $line, $match))
              {
                  if ($match[1] == $match[2]) {
                      $quote = $match[1];
                      break;
                  }
              }
          }
          $conf['quote'] = $quote;
          fclose($fp);
          // XXX What about trying to discover the "header"?
          return $conf;
      }
  }
  405. ?>