4 # FILE: SPT--Recommender.php
6 # Part of the Collection Workflow Integration System (CWIS)
7 # Copyright 2004-2013 Edward Almasy and Internet Scout Research Group
8 # http://scout.wisc.edu/cwis/
13 # ---- PUBLIC INTERFACE --------------------------------------------------
14 # define content field types
23 $ItemIdFieldName, $UserIdFieldName, $RatingFieldName,
26 # set default parameters
27 $this->ContentCorrelationThreshold = 1;
29 # save database object
32 # save new configuration values
33 $this->ItemTableName = $ItemTableName;
34 $this->RatingTableName = $RatingTableName;
35 $this->ItemIdFieldName = $ItemIdFieldName;
36 $this->UserIdFieldName = $UserIdFieldName;
37 $this->RatingFieldName = $RatingFieldName;
38 $this->ContentFields = $ContentFields;
40 # set default debug state
44 # set level for debugging output
51 # ---- recommendation methods
53 # recommend items for specified user
54 function Recommend($UserId, $StartingResult = 0, $NumberOfResults = 10)
58 print
"REC: Recommend(${UserId}, ${StartingResult},"
59 .
" ${NumberOfResults})<br>\n";
62 # load in user ratings
65 $DB->Query(
"SELECT ".$this->ItemIdFieldName.
", ".$this->RatingFieldName
66 .
" FROM ".$this->RatingTableName
67 .
" WHERE ".$this->UserIdFieldName.
" = ${UserId}");
68 while ($Row = $DB->FetchRow())
70 $Ratings[$Row[$this->ItemIdFieldName]] =
71 $Row[$this->RatingFieldName];
75 print
"REC: user has rated ".count($Ratings).
" items<br>\n";
78 # for each item that user has rated
80 foreach ($Ratings as $ItemId => $ItemRating)
82 # for each content correlation available for that item
83 $DB->Query(
"SELECT Correlation, ItemIdB "
84 .
"FROM RecContentCorrelations "
85 .
"WHERE ItemIdA = ${ItemId}");
86 while ($Row = $DB->FetchRow())
88 # multiply that correlation by normalized rating and add
89 # resulting value to recommendation value for that item
90 if (isset($RecVals[$Row[
"ItemIdB"]]))
92 $RecVals[$Row[
"ItemIdB"]] +=
93 $Row[
"Correlation"] * ($ItemRating - 50);
97 $RecVals[$Row[
"ItemIdB"]] =
98 $Row[
"Correlation"] * ($ItemRating - 50);
102 print
"REC: RecVal[".$Row[
"ItemIdB"].
"] = "
103 .$RecVals[$Row[
"ItemIdB"]].
"<br>\n";
109 print
"REC: found ".count($RecVals).
" total recommendations<br>\n";
112 # calculate average correlation between items
113 $ResultThreshold = $DB->Query(
"SELECT AVG(Correlation) "
114 .
"AS Average FROM RecContentCorrelations",
"Average");
115 $ResultThreshold = round($ResultThreshold) * 2;
117 # for each recommended item
118 foreach ($RecVals as $ItemId => $RecVal)
120 # remove item from list if user already rated it
121 if (isset($Ratings[$ItemId]))
123 unset($RecVals[$ItemId]);
127 # scale recommendation value back to match thresholds
128 $RecVals[$ItemId] = round($RecVal / 50);
130 # remove item from recommendation list if value is below threshold
131 if ($RecVals[$ItemId] < $ResultThreshold)
133 unset($RecVals[$ItemId]);
139 print
"REC: found ".count($RecVals).
" positive recommendations<br>\n";
142 # sort recommendation list by value
143 if (isset($RecVals)) { arsort($RecVals, SORT_NUMERIC); }
145 # save total number of results available
146 $this->NumberOfResultsAvailable = count($RecVals);
148 # trim result list to match range requested by caller
149 $RecValKeys = array_slice(
150 array_keys($RecVals), $StartingResult, $NumberOfResults);
151 $RecValSegment = array();
152 foreach ($RecValKeys as $Key)
154 $RecValSegment[$Key] = $RecVals[$Key];
157 # return recommendation list to caller
158 return $RecValSegment;
161 # add function to be called to filter returned recommendation list
164 # save filter function name
165 $this->FilterFuncs[] = $FunctionName;
168 # return number of recommendations generated
171 return $this->NumberOfResultsAvailable;
174 # return recommendation generation time
177 return $this->LastSearchTime;
180 # return list of items used to generate recommendation of specified item
183 # pull list of correlations from DB
184 $this->DB->Query(
"SELECT * FROM RecContentCorrelations, ".$this->RatingTableName
185 .
" WHERE (ItemIdA = ${RecommendedItemId}"
186 .
" OR ItemIdB = ${RecommendedItemId})"
187 .
" AND ".$this->UserIdFieldName.
" = ".$UserId
188 .
" AND (RecContentCorrelations.ItemIdA = "
189 .$this->RatingTableName.
".".$this->ItemIdFieldName
190 .
" OR RecContentCorrelations.ItemIdB = "
191 .$this->RatingTableName.
".".$this->ItemIdFieldName.
")"
192 .
" AND Rating >= 50 "
193 .
" ORDER BY Correlation DESC");
195 # for each correlation
196 $SourceList = array();
197 while ($Row = $this->DB->FetchRow())
199 # pick out appropriate item ID
200 if ($Row[
"ItemIdA"] == $RecommendedItemId)
202 $ItemId = $Row[
"ItemIdB"];
206 $ItemId = $Row[
"ItemIdA"];
209 # add item to recommendation source list
210 $SourceList[$ItemId] = $Row[
"Correlation"];
213 # return recommendation source list to caller
217 # dynamically generate and return list of items similar to specified item
222 print
"REC: searching for items similar to item \""
226 # make sure we have item IDs available
229 # start with empty array
230 $SimilarItems = array();
233 foreach ($this->ItemIds as $Id)
235 # if item is not specified item
238 # calculate correlation of item to specified item
240 $ItemId, $Id, $FieldList);
242 # if correlation is above threshold
243 if ($Correlation > $this->ContentCorrelationThreshold)
245 # add item to list of similar items
246 $SimilarItems[$Id] = $Correlation;
252 print
"REC: ".count($SimilarItems).
" similar items to item \""
253 .$ItemId.
"\" found<br>\n";
256 # filter list of similar items (if any)
257 if (count($SimilarItems) > 0)
262 print
"REC: ".count($SimilarItems).
" similar items to item \""
263 .$ItemId.
"\" left after filtering<br>\n";
267 # if any similar items left
268 if (count($SimilarItems) > 0)
270 # sort list of similar items in order of most to least similar
271 arsort($SimilarItems, SORT_NUMERIC);
274 # return list of similar items to caller
275 return $SimilarItems;
278 # dynamically generate and return list of recommended field values for item
283 print
"REC: generating field value recommendations for item \""
287 # start with empty array of values
290 # generate list of similar items
293 # if similar items found
294 if (count($SimilarItems) > 0)
296 # prune list of similar items to only top third of better-than-average
297 $AverageCorr = intval(array_sum($SimilarItems) / count($SimilarItems));
298 reset($SimilarItems);
299 $HighestCorr = current($SimilarItems);
300 $CorrThreshold = intval($HighestCorr - (($HighestCorr - $AverageCorr) / 3));
303 print
"REC: <i>Average Correlation: $AverageCorr"
304 .
" Highest Correlation:"
305 .
" $HighestCorr Correlation"
306 .
" Threshold: $CorrThreshold </i><br>\n";
308 foreach ($SimilarItems as $ItemId => $ItemCorr)
310 if ($ItemCorr < $CorrThreshold)
312 unset($SimilarItems[$ItemId]);
317 print
"REC: ".count($SimilarItems)
318 .
" similar items left after threshold pruning<br>\n";
322 foreach ($SimilarItems as $SimItemId => $SimItemCorr)
325 foreach ($this->ContentFields as $FieldName => $FieldAttributes)
327 # load field data for this item
328 $FieldData = $this->GetFieldValue($SimItemId, $FieldName);
330 # if field data is array
331 if (is_array($FieldData))
333 # for each field data value
334 foreach ($FieldData as $FieldDataVal)
336 # if data value is not empty
337 $FieldDataVal = trim($FieldDataVal);
338 if (strlen($FieldDataVal) > 0)
340 # increment count for data value
341 $RecVals[$FieldName][$FieldDataVal]++;
347 # if data value is not empty
348 $FieldData = trim($FieldData);
349 if (strlen($FieldData) > 0)
351 # increment count for data value
352 $RecVals[$FieldName][$FieldData]++;
359 $MatchingCountThreshold = 3;
360 foreach ($RecVals as $FieldName => $FieldVals)
362 # determine cutoff threshold
363 arsort($FieldVals, SORT_NUMERIC);
365 $HighestCount = current($FieldVals);
366 $AverageCount = intval(array_sum($FieldVals) / count($FieldVals));
367 $CountThreshold = intval($AverageCount
368 + (($HighestCount - $AverageCount) / 2));
369 if ($CountThreshold < $MatchingCountThreshold)
371 $CountThreshold = $MatchingCountThreshold;
375 print
"REC: <i>Field: $FieldName "
376 .
" Average Count: $AverageCount "
377 .
" Highest Count: $HighestCount "
378 .
" Count Threshold: $CountThreshold </i><br>\n";
381 # for each field data value
382 foreach ($FieldVals as $FieldVal => $FieldValCount)
384 # if value count is below threshold
385 if ($FieldValCount < $CountThreshold)
388 unset($RecVals[$FieldName][$FieldVal]);
394 print
"REC: found ".count($RecVals[$FieldName])
395 .
" recommended values for field \""
396 .$FieldName.
"\" after threshold pruning<br>\n";
401 # return recommended values to caller
406 # ---- database update methods
412 print
"REC: UpdateForItems(${StartingItemId},"
413 .
" ${NumberOfItems})<br>\n";
415 # make sure we have item IDs available
421 foreach ($this->ItemIds as $ItemId)
423 # if item ID is within requested range
424 if ($ItemId >= $StartingItemId)
426 # update recommender info for item
428 { print(
"REC: doing item ${ItemId}<br>\n"); }
432 # if we have done requested number of items
433 if ($ItemsUpdated >= $NumberOfItems)
438 print
"REC: bailing out with item ${ItemId}<br>\n";
445 # return ID of last resource updated to caller
453 print
"REC: updating for item \"".$ItemId.
"\"<br>\n";
456 # make sure we have item IDs available
459 # clear existing correlations for this item
460 $this->DB->Query(
"DELETE FROM RecContentCorrelations "
461 .
"WHERE ItemIdA = ${ItemId}");
464 foreach ($this->ItemIds as $Id)
466 # if not doing a full pass or item is later in list than current item
467 if (($FullPass == FALSE) || ($Id > $ItemId))
469 # update correlation value for item and target item
477 # drop all correlation entries referring to item
478 $this->DB->Query(
"DELETE FROM RecContentCorrelations "
479 .
"WHERE ItemIdA = ".$ItemId.
" "
480 .
"OR ItemIdB = ".$ItemId);
485 # get average correlation
486 $AverageCorrelation = $this->DB->Query(
"SELECT AVG(Correlation) "
487 .
"AS Average FROM RecContentCorrelations",
"Average");
489 # dump all correlations at or below the average
490 if ($AverageCorrelation > 0)
492 $this->DB->Query(
"DELETE FROM RecContentCorrelations "
493 .
"WHERE Correlation <= ${AverageCorrelation}");
503 if (self::$ItemIdCache === NULL)
505 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM "
506 .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
507 self::$ItemIdCache = $this->DB->FetchColumn(
"Id");
509 return self::$ItemIdCache;
518 self::$CorrelationCache = NULL;
519 self::$ItemIdCache = NULL;
520 self::$ItemDataCache = NULL;
524 # ---- PRIVATE INTERFACE -------------------------------------------------
526 private $ContentCorrelationThreshold;
527 private $ContentFields;
528 private $ItemTableName;
529 private $RatingTableName;
530 private $ItemIdFieldName;
531 private $UserIdFieldName;
532 private $RatingFieldName;
535 private $FilterFuncs;
536 private $LastSearchTime;
537 private $NumberOfResultsAvailable;
540 static private $ItemIdCache = NULL;
541 static private $ItemDataCache = NULL;
542 static private $CorrelationCache = NULL;
546 # if item IDs not already loaded
547 if (!isset($this->ItemIds))
549 # load item IDs from DB
550 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM "
551 .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
552 $this->ItemIds = array();
553 while ($Item = $this->DB->FetchRow())
555 $this->ItemIds[] = $Item[
"Id"];
562 # if data not already loaded
563 if (!isset(self::$ItemDataCache[$ItemId][$FieldName]))
565 # load field value from DB
566 $FieldValue = $this->GetFieldValue($ItemId, $FieldName);
568 # if field value is array
569 if (is_array($FieldValue))
571 # concatenate together text from array elements
572 $FieldValue = implode(
" ", $FieldValue);
575 # normalize text and break into word array
576 self::$ItemDataCache[$ItemId][$FieldName] =
580 # return cached data to caller
581 return self::$ItemDataCache[$ItemId][$FieldName];
584 # calculate content correlation between two items and return value to caller
588 if ($this->
DebugLevel > 10) { print(
"REC: calculating correlation"
589 .
" between items $ItemIdA and $ItemIdB<br>\n"); }
591 # order item ID numbers
592 if ($ItemIdA > $ItemIdB)
599 # if we already have the correlation
600 if (isset(self::$CorrelationCache[$ItemIdA][$ItemIdB]))
602 # retrieve correlation from cache
603 $TotalCorrelation = self::$CorrelationCache[$ItemIdA][$ItemIdB];
607 # if list of fields to correlate specified
608 if ($FieldList != NULL)
610 # create list with only specified fields
611 foreach ($FieldList as $FieldName)
613 $ContentFields[$FieldName] = $this->ContentFields[$FieldName];
619 $ContentFields = $this->ContentFields;
622 # for each content field
623 $TotalCorrelation = 0;
624 foreach ($ContentFields as $FieldName => $FieldAttributes)
626 # if field is of a type that we use for correlation
627 $FieldType = intval($FieldAttributes[
"FieldType"]);
628 if (($FieldType == self::CONTENTFIELDTYPE_TEXT)
629 || ($FieldType == self::CONTENTFIELDTYPE_CONTROLLEDNAME))
636 print
"REC: loaded ".count($ItemAData)
637 .
" terms for item #".$ItemIdA.
" and "
638 .count($ItemBData).
" terms for item #"
639 .$ItemIdB.
" for field \"".$FieldName.
"\"<br>\n";
642 # call appropriate routine to get correlation
645 case self::CONTENTFIELDTYPE_TEXT:
646 case self::CONTENTFIELDTYPE_CONTROLLEDNAME:
648 $ItemAData, $ItemBData);
652 # add correlation multiplied by weight to total
653 $TotalCorrelation += $Correlation * $FieldAttributes[
"Weight"];
657 # store correlation to cache
658 self::$CorrelationCache[$ItemIdA][$ItemIdB] = $TotalCorrelation;
661 # return correlation value to caller
664 print(
"REC: correlation between items $ItemIdA and $ItemIdB"
665 .
" found to be $TotalCorrelation<br>\n");
667 return $TotalCorrelation;
670 # calculate content correlation between two items and update in DB
673 if ($this->
DebugLevel > 6) { print(
"REC: updating correlation between"
674 .
" items $ItemIdA and $ItemIdB<br>\n"); }
676 # bail out if two items are the same
677 if ($ItemIdA == $ItemIdB) {
return; }
679 # calculate correlation
682 # save new correlation
756 # strip any HTML tags
757 $Text = strip_tags($Text);
759 # strip any punctuation
760 $Text = preg_replace(
"/,\\.\\?-\\(\\)\\[\\]\"/",
" ", $Text); #
"
762 # normalize whitespace
763 $Text = trim(preg_replace("/[\\s]+/
", " ", $Text));
765 # convert to all lower case
766 $Text = strtolower($Text);
768 # split text into arrays of words
769 $Words = explode(" ", $Text);
771 # filter out all stop words
772 $Words = array_diff($Words, $StopWords);
774 # return word array to caller
778 protected function CalcTextCorrelation($WordsA, $WordsB)
780 # get array containing intersection of two word arrays
781 $IntersectWords = array_intersect($WordsA, $WordsB);
783 # return number of words remaining as score
784 return count($IntersectWords);
787 protected function ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation = -1)
789 # if item ID A is greater than item ID B
790 if ($ItemIdA > $ItemIdB)
798 # if new correlation value provided
799 if ($NewCorrelation != -1)
801 # if new value is above threshold
802 if ($NewCorrelation >= $this->ContentCorrelationThreshold)
804 # insert new correlation value in DB
805 $this->DB->Query("INSERT INTO RecContentCorrelations
"
806 ."(ItemIdA, ItemIdB, Correlation)
"
807 ."VALUES (${ItemIdA}, ${ItemIdB}, ${NewCorrelation})
");
809 # return value is new correlation value
810 $Correlation = $NewCorrelation;
815 # return value is zero
821 # retrieve correlation value from DB
822 $Correlation = $this->DB->Query(
823 "SELECT Correlation FROM RecContentCorrelations
"
824 ."WHERE ItemIdA = ${ItemIdA} AND ItemIdB = ${ItemIdB}
",
827 # if no value found in DB
828 if ($Correlation == FALSE)
830 # return value is zero
835 # return correlation value to caller
839 protected function FilterOnSuppliedFunctions($Results)
841 # if filter functions have been set
842 if (count($this->FilterFuncs) > 0)
845 foreach ($Results as $ResourceId => $Result)
847 # for each filter function
848 foreach ($this->FilterFuncs as $FuncName)
850 # if filter function returns TRUE for result resource
851 if ($FuncName($ResourceId))
854 if ($this->DebugLevel > 2)
856 print("REC: filter callback rejected resource
"
857 ." ${ResourceId}<br>\n
");
859 unset($Results[$ResourceId]);
861 # bail out of filter func loop
868 # return filtered list to caller
RecommendFieldValues($ItemId, $FieldList=NULL)
UpdateForItems($StartingItemId, $NumberOfItems)
GetSourceList($UserId, $RecommendedItemId)
ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation=-1)
const CONTENTFIELDTYPE_CONTROLLEDNAME
AddResultFilterFunction($FunctionName)
const CONTENTFIELDTYPE_DATE
__construct(&$DB, $ItemTableName, $RatingTableName, $ItemIdFieldName, $UserIdFieldName, $RatingFieldName, $ContentFields)
FilterOnSuppliedFunctions($Results)
GetItemIds()
Retrieve all item IDs.
UpdateForItem($ItemId, $FullPass=FALSE)
const CONTENTFIELDTYPE_NUMERIC
GetFieldData($ItemId, $FieldName)
UpdateContentCorrelation($ItemIdA, $ItemIdB)
Recommend($UserId, $StartingResult=0, $NumberOfResults=10)
CalcTextCorrelation($WordsA, $WordsB)
const CONTENTFIELDTYPE_TEXT
FindSimilarItems($ItemId, $FieldList=NULL)
const CONTENTFIELDTYPE_DATERAMGE
NormalizeAndParseText($Text)
CalculateContentCorrelation($ItemIdA, $ItemIdB, $FieldList=NULL)
static ClearCaches()
Clear internal caches of item and correlation data.