Core  3.2
PHP API documentation
 All Data Structures Namespaces Files Functions Variables Pages
Class.Dcp_Utils_CSVFormatDetector.php
Go to the documentation of this file.
1 <?php
2 /*
3  * @author Anakeen
4  * @package FDL
5 */
6 
7 namespace Dcp\Utils\CSVFormatDetector;
8 
/**
 * One observation recorded for a candidate string.
 *
 * A sample couples a candidate string with the number of times it was
 * observed and a weight derived from that number.
 */
class Sample
{
    /**
     * @var mixed candidate string this sample refers to (null until constructed)
     */
    public $str = null;
    /**
     * @var int|float number of observations carried by this sample
     */
    public $count = 0;
    /**
     * @var int|float observation count multiplied by the weight factor
     */
    public $weight = 1;
    /**
     * @var int|float score of the sample; the constructor leaves it at 0
     */
    public $score = 0;

    /**
     * @param mixed     $str    candidate string
     * @param int|float $count  number of observations
     * @param int|float $weight multiplier applied to $count to obtain the stored weight
     */
    public function __construct($str, $count, $weight = 1)
    {
        $this->str = $str;
        $this->count = $count;
        // The stored weight is the observation count scaled by the given factor.
        $this->weight = $count * $weight;
    }
}
23 
25 {
26  /**
27  * @var Sample[]
28  */
29  public $samples = array();
30 
/**
 * Record a new sample.
 *
 * @param mixed     $str    candidate string
 * @param int|float $count  number of observations
 * @param int|float $weight multiplier forwarded to the Sample constructor
 */
public function add($str, $count, $weight = 1)
{
    $sample = new Sample($str, $count, $weight);
    $this->samples[] = $sample;
}
35 
/**
 * Recompute the score of every stored sample.
 *
 * A sample's score is its weight divided by the sum of all weights.
 * When that sum is 0 every score is set to 0.
 */
public function updateScores()
{
    $weightSum = 0;
    foreach ($this->samples as $entry) {
        $weightSum += $entry->weight;
    }

    foreach ($this->samples as $entry) {
        // Guard against a division by zero when no weight was accumulated.
        $entry->score = ($weightSum == 0) ? 0 : $entry->weight / $weightSum;
    }
}
52 
/**
 * Merge the samples that share the same string and return them ordered
 * by descending score.
 *
 * Scores are refreshed and the stored samples are sorted first. Then the
 * counts and scores of samples with an identical string are summed into a
 * single entry.
 *
 * Each merged entry is a copy: the stored samples are never modified by
 * the merge, so calling this method repeatedly yields the same counts.
 *
 * @return Sample[] merged samples, highest score first, indexed from 0
 */
public function getMergedSamples()
{
    $this->updateScores();
    $this->sortByScore($this->samples);

    $mergedSamples = array();
    foreach ($this->samples as $sample) {
        if (isset($mergedSamples[$sample->str])) {
            $mergedSamples[$sample->str]->count += $sample->count;
            $mergedSamples[$sample->str]->score += $sample->score;
        } else {
            // Clone so the accumulation above cannot leak into $this->samples.
            $mergedSamples[$sample->str] = clone $sample;
        }
    }

    $this->sortByScore($mergedSamples);
    return array_values($mergedSamples);
}
/**
 * Return the merged sample with the highest confidence score, or null
 * when there are no samples or none reaches the requested confidence.
 *
 * @param int|float $minConfidence minimum score a sample must have to be returned
 *
 * @return null|Sample
 */
public function getCandidate($minConfidence = 0)
{
    // getMergedSamples() returns the samples ordered by descending score,
    // so the first one that passes the threshold is the best candidate.
    foreach ($this->getMergedSamples() as $sample) {
        if ($sample->score >= $minConfidence) {
            return $sample;
        }
    }
    return null;
}
/**
 * Sort the given samples in place by descending score, preserving keys.
 *
 * @param Sample[] $samples array to sort (modified by reference)
 */
protected function sortByScore(&$samples)
{
    $byDescendingScore = function (Sample $left, Sample $right) {
        if ($left->score == $right->score) {
            return 0;
        }
        if ($left->score < $right->score) {
            return 1;
        }
        return -1;
    };
    uasort($samples, $byDescendingScore);
}
101 
/**
 * Print one line per sample with its string, count and score.
 *
 * @param bool $merged when true print the merged samples, otherwise
 *                     refresh the scores and print the stored samples
 */
public function dump($merged = true)
{
    if (!$merged) {
        $this->updateScores();
        $rows = $this->samples;
    } else {
        $rows = $this->getMergedSamples();
    }

    foreach ($rows as $row) {
        printf("\t{[%s], %s, %s}\n", $row->str, $row->count, $row->score);
    }
}
115 }
/**
 * Class Detector
 *
 * Try to detect the CSV separator (either ";" or ",") and enclosure (either "'" or '"')
 * by applying some rules to detect patterns and counting scores when there is a match:
 * the highest score >= 75% wins!
 *
 * Not bulletproof, but it seems to perform correctly (i.e. as expected) with CSV from
 * dynacase-core and tests.
 *
 * @package Dcp\Utils\CSVFormatDetector
 */
class Detector
{
    /**
     * Minimum confidence score a candidate must reach to be accepted.
     */
    const MIN_CONFIDENCE = 0.75;

    /**
     * @var bool when true, diagnostic messages are printed during detection
     */
    public $debug;
    /**
     * @var string[] separator characters to try
     */
    public $separators;
    /**
     * @var string[] enclosure characters to try
     */
    public $enclosures;

    public function __construct()
    {
        $this->reset();
    }

    /**
     * Reset the detector to its defaults: debug off, separators "," and ";",
     * enclosures "'" and '"'.
     */
    public function reset()
    {
        $this->debug = false;
        $this->separators = array(
            ',',
            ';'
        );
        $this->enclosures = array(
            "'",
            '"'
        );
    }

    /**
     * Print a debug message if debug is enabled.
     *
     * @param string $msg message to print
     */
    protected function debug($msg)
    {
        if ($this->debug === true) {
            print $msg;
        }
    }

    /**
     * Detect the CSV separator and enclosure of the given CSV text data.
     *
     * The returned array contains the detected separator and enclosure with their
     * confidence score expressed as a floating point value in the range [0, 1]:
     *
     * Example:
     *
     *     array(
     *         'separator' => array(
     *             'char' => ';',
     *             'confidence' => 1
     *         ),
     *         'enclosure' => array(
     *             'char' => null,
     *             'confidence' => 0
     *         )
     *     )
     *
     * A null 'char' indicates that the corresponding value could not be detected.
     * The enclosure is only searched for once a separator has been identified.
     * A non-string $text yields the "nothing detected" result.
     *
     * @param string $text CSV text to analyse
     * @return array
     */
    public function detect($text)
    {
        $result = array(
            'separator' => array(
                'char' => null,
                'confidence' => 0
            ),
            'enclosure' => array(
                'char' => null,
                'confidence' => 0
            )
        );

        if (!is_string($text)) {
            return $result;
        }

        /*
         * Detect separators
         */
        $stats = $this->collectSeparatorStats($text);
        if ($this->debug) {
            $stats->dump();
        }
        /* Get best separator candidate (i.e. the one with the highest confidence score) */
        $candidate = $stats->getCandidate(self::MIN_CONFIDENCE);
        if ($candidate === null) {
            $this->debug(sprintf(" Could not identify separator!\n"));
            return $result;
        }
        $this->debug(sprintf(" Identified separator [%s] with %.02f%% confidence (count %d).\n", $candidate->str, 100 * $candidate->score, $candidate->count));
        $this->debug(sprintf("\n"));
        $sepChar = $candidate->str;
        $result['separator']['char'] = $sepChar;
        $result['separator']['confidence'] = $candidate->score;

        /*
         * Detect enclosures
         */
        $stats = $this->collectEnclosureStats($text, $sepChar);
        if ($this->debug) {
            $stats->dump();
        }
        /* Get best enclosure candidate (i.e. the one with the highest confidence score) */
        $candidate = $stats->getCandidate(self::MIN_CONFIDENCE);
        if ($candidate === null) {
            $this->debug(sprintf(" Could not identify enclosure!\n"));
            $this->debug(sprintf("\n"));
            return $result;
        }
        $this->debug(sprintf(" Identified enclosure [%s] with %.02f%% confidence (count %d).\n", $candidate->str, 100 * $candidate->score, $candidate->count));
        $this->debug(sprintf("\n"));
        $result['enclosure']['char'] = $candidate->str;
        $result['enclosure']['confidence'] = $candidate->score;

        return $result;
    }

    /**
     * Collect, for every configured separator, samples describing how often and
     * how long separator-like patterns occur in the text.
     *
     * @param string $text CSV text to analyse
     * @return SampleAccumulator
     */
    private function collectSeparatorStats($text)
    {
        $stats = new SampleAccumulator();
        foreach ($this->separators as $sep) {
            $sepRE = preg_quote($sep, '/');
            $patterns = array(
                // Trailing separators (minimum length of 2): e.g. xxx(;;)
                sprintf('/(%s{2,})$/ms', $sepRE) => array(
                    'count' => 1,
                    'length' => 1
                ),
                // Consecutive separators (minimum length of 2): e.g. xxx(;;)xxx
                sprintf('/(%s{2,})./ms', $sepRE) => array(
                    'count' => 1,
                    'length' => 1
                ),
                // Sequences of only separators and identifiers (with minimum identifiers length of 3)
                sprintf('/((?:%s\w{3,})+)/uims', $sepRE) => array(
                    'count' => 1,
                    'length' => 1
                )
            );
            foreach ($patterns as $re => $weightMultiplier) {
                if (!preg_match_all($re, $text, $m)) {
                    continue;
                }
                /*
                 * Add one for each pattern found:
                 * the more we match, the greater the score will get
                 */
                $stats->add($sep, count($m[1]), $weightMultiplier['count']);
                foreach ($m[1] as $match) {
                    /*
                     * Add score with length of the found patterns:
                     * the longer we match, the greater the score will get.
                     * $match is the captured string itself, so its full length
                     * is measured (not the length of one of its characters).
                     */
                    $stats->add($sep, strlen($match), $weightMultiplier['length']);
                }
            }
        }
        return $stats;
    }

    /**
     * Collect, for every configured enclosure, samples counting the fields that
     * are wrapped in that enclosure and delimited by the given separator.
     *
     * @param string $text    CSV text to analyse
     * @param string $sepChar separator character already identified
     * @return SampleAccumulator
     */
    private function collectEnclosureStats($text, $sepChar)
    {
        $stats = new SampleAccumulator();
        $sepRE = preg_quote($sepChar, '/');
        foreach ($this->enclosures as $enc) {
            $encRE = preg_quote($enc, '/');
            // Search for: xxx(;"xxx";)xxx
            $re = sprintf('/(%s%s[^%s]+%s(?:%s|$))/ms', $sepRE, $encRE, $encRE, $encRE, $sepRE);
            if (preg_match_all($re, $text, $m)) {
                $stats->add($enc, count($m[1]));
            }
        }
        return $stats;
    }
}
print< H1 > Check Database< i > $dbaccess</i ></H1 > $a
Definition: checklist.php:45
print
Definition: checklist.php:49
← centre documentaire © anakeen