7 namespace Dcp\Utils\CSVFormatDetector;
// The stored weight is the occurrence count scaled by the caller-supplied
// $weight multiplier.
20 $this->weight = $this->count *
$weight;
// Record one observation.
//   $str    - the candidate string the observation is about
//   $count  - how many times it was observed
//   $weight - multiplier forwarded to the Sample (defaults to 1)
31 public function add($str, $count, $weight = 1)
// Wrap the three values in a Sample and append it to $this->samples.
33 $this->samples[] =
new Sample($str, $count, $weight);
// First pass: add up the weight of every sample into $totalWeight.
// NOTE(review): both loops iterate by reference and no unset($sample) is
// visible in this excerpt - confirm the reference is released afterwards.
39 foreach ($this->samples as & $sample) {
40 $totalWeight+= $sample->weight;
// Second pass: give each sample a score equal to its share of the total weight.
43 foreach ($this->samples as & $sample) {
// Guard for a zero total, which would otherwise divide by zero below
// (the body of this branch is not visible in this excerpt).
44 if ($totalWeight == 0) {
47 $sample->score = $sample->weight / $totalWeight;
// Collapse samples that share the same string into one entry, keyed by that string.
57 $mergedSamples = array();
58 foreach ($this->samples as & $sample) {
// NOTE(review): "$mergesSamples" (ending in "s") is a different variable from
// "$mergedSamples" used everywhere else in this block, so this initialisation
// never touches the result array. Looks like a typo - confirm and fix.
59 if (!isset($mergesSamples[$sample->str])) {
60 $mergesSamples[$sample->str] = array();
// An entry for this string already exists: fold this sample's count and score into it.
62 if (isset($mergedSamples[$sample->str]->str)) {
63 $mergedSamples[$sample->str]->count+= $sample->count;
64 $mergedSamples[$sample->str]->score+= $sample->score;
// Presumably the "else" branch (its opening line is not visible here): the first
// sample seen for a string becomes the merged entry.
// NOTE(review): the Sample object itself is stored, not a copy, so the "+="
// above also changes the object held in $this->samples - confirm this is intended.
66 $mergedSamples[$sample->str] = $sample;
// Drop the string keys and return a numerically indexed list.
71 return array_values($mergedSamples);
// Predicate: true when the sample's score is at least $minConfidence.
// Presumably the body of a filter callback - the enclosing lines are not visible.
84 return ($sample->score >= $minConfidence);
// Comparator on score: equal scores take the branch below (its body is not
// visible here); otherwise 1 is returned when $a scores lower than $b and -1
// when it scores higher, which orders higher scores first in a usort-style sort.
95 if ($a->score == $b->score) {
98 return ($a->score < $b->score) ? 1 : -1;
// Print every sample to standard output, one per line.
//   $merged - defaults to true; presumably selects between the merged list
//             (line 105) and the raw list with refreshed scores (lines 107-108).
//             The branching lines themselves are not visible in this excerpt.
102 public function dump($merged =
true)
105 $samples = $this->getMergedSamples();
107 $this->updateScores();
108 $samples = $this->samples;
// Each sample is printed as a tab-indented "{[str], count, score}" line.
110 foreach ($samples as & $sample) {
111 printf(
"\t{[%s], %s, %s}\n", $sample->str, $sample->count, $sample->score);
// Defaults: debug output is switched off.
143 $this->debug =
false;
// Candidate separator strings to test (the list contents are not visible in this excerpt).
144 $this->separators = array(
// Candidate enclosure strings to test (the list contents are not visible in this excerpt).
148 $this->enclosures = array(
// Only act when the debug flag is strictly boolean true (the body of the
// branch is not visible in this excerpt).
160 if ($this->debug ===
true) {
// The result has a 'separator' and an 'enclosure' section; the code below fills
// in the 'char' and 'confidence' keys of each.
191 'separator' => array(
195 'enclosure' => array(
// Input validation: a non-string $text takes this branch (its body is not visible here).
201 if (!is_string($text)) {
// --- Separator detection: test every candidate separator against the text. ---
208 foreach ($this->separators as $sep) {
// Escape the candidate so it can be embedded in a "/"-delimited regex.
209 $sepRE = preg_quote($sep,
'/');
// Three patterns are tried per candidate, each mapped to an array holding
// 'count' and 'length' weight multipliers (the values are not visible here):
//   1. a run of two or more separators at the end of a line,
//   2. a run of two or more separators followed by any character,
//   3. one or more repetitions of "separator + three or more word characters".
212 sprintf(
'/(%s{2,})$/ms', $sepRE) => array(
217 sprintf(
'/(%s{2,})./ms', $sepRE) => array(
222 sprintf(
'/((?:%s\w{3,})+)/uims', $sepRE) => array(
226 ) as $re => $weightMultiplier) {
227 if (preg_match_all($re, $text, $m)) {
// One sample for the number of matches, weighted by the 'count' multiplier.
232 $stats->add($sep, count($m[1]) , $weightMultiplier[
'count']);
233 foreach ($m[1] as $n) {
// One sample per match, weighted by the 'length' multiplier.
// NOTE(review): preg_match_all is called without flags, so each $n is a string
// and $n[1] is only its second byte, making strlen($n[1]) always 1. strlen($n)
// may have been intended - confirm, unless hidden lines 234-237 change $n.
238 $stats->add($sep, strlen($n[1]) , $weightMultiplier[
'length']);
// Ask for a separator candidate using a 0.75 confidence threshold.
247 $candidate = $stats->getCandidate(0.75);
248 if ($candidate === null) {
249 $this->debug(sprintf(
" Could not identify separator!\n"));
252 $this->debug(sprintf(
" Identified separator [%s] with %.02f%% confidence (count %d).\n", $candidate->str, 100 * $candidate->score, $candidate->count));
253 $this->debug(sprintf(
"\n"));
// Record the chosen separator and its score in the result.
254 $sepChar = $candidate->str;
255 $result[
'separator'][
'char'] = $sepChar;
256 $result[
'separator'][
'confidence'] = $candidate->score;
// --- Enclosure detection: uses the separator chosen above. ---
261 $sepRE = preg_quote($sepChar,
'/');
262 foreach ($this->enclosures as $enc) {
263 $encRE = preg_quote($enc,
'/');
// Pattern: separator, opening enclosure, one or more non-enclosure characters,
// closing enclosure, then either a separator or the end of a line.
266 sprintf(
'/(%s%s[^%s]+%s(?:%s|$))/ms', $sepRE, $encRE, $encRE, $encRE, $sepRE) ,
268 if (preg_match_all($re, $text, $m)) {
// One sample for the number of matches; no weight argument is passed here.
269 $stats->add($enc, count($m[1]));
// Ask for an enclosure candidate using the same 0.75 confidence threshold.
277 $candidate = $stats->getCandidate(0.75);
278 if ($candidate === null) {
279 $this->debug(sprintf(
" Could not identify enclosure!\n"));
280 $this->debug(sprintf(
"\n"));
283 $this->debug(sprintf(
" Identified enclosure [%s] with %.02f%% confidence (count %d).\n", $candidate->str, 100 * $candidate->score, $candidate->count));
284 $this->debug(sprintf(
"\n"));
// Record the chosen enclosure and its score in the result.
285 $encChar = $candidate->str;
286 $result[
'enclosure'][
'char'] = $encChar;
287 $result[
'enclosure'][
'confidence'] = $candidate->score;
print< H1 > Check Database< i > $dbaccess</i ></H1 > $a