#!/usr/bin/php -q
<?php
// Remove the execution time limit: the script is meant to run forever as a daemon
set_time_limit(0);
// Pin the memory ceiling at 32M.
// NOTE(review): current PHP builds default to 128M, so this may lower rather than
// raise the limit - confirm the intended value
ini_set("memory_limit", "32M");
// Report every error and notice through PHP's internal error handler
error_reporting(E_ALL);
// Whitelist of words to look for. Each entry is a lower-case word, because posts
// are lower-cased before they are compared against this list.
$terms = array(
    'damn', 'boyfriend', 'girlfriend', 'angst', 'depression', 'fuck', 'shit', 'me',
    'life', 'nothing', 'ohmigod', 'ew', 'parents', 'school', 'crap', 'cunt',
    'bastard', 'love', 'hate', 'everyone', 'lousy', 'lonely', 'mean', 'bitch',
    'stupid', 'conscience', 'tears', 'despair', 'pants', 'sad', 'trash', 'heavy',
    'emotion', 'boring', 'hot', 'boy', 'girl', 'grade', 'teacher',
);
// Alternatively, you can use a blacklist rather than a whitelist approach.
// This is a bit fairer from a scientific point of view, but nowhere near as interesting.
// If you prefer to exclude stop words rather than include only target words, uncomment the
// array below (both lines) and, in the word loop further down, swap the active term test
// for the commented-out $stopwords test that sits right beside it.
//$stopwords = array("a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "computer", "com", "con", "could", "couldnt", "cry", "day", "de", "describe", "detail", "did", "didn", "do", "doesn", "don", "done", "down", "due", "during", "each", "een", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "en", "enough", "etc", "even", "ever", "every", "everywhere", "except", "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "gm", "go", "going", "good", "got", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "interest", "into", "ik", "is", "it", "its", "itself", "just", "keep", "know", "la", "le", "last", "latter", "latterly", "least", "less", "like", "ll", "ltd", "made", "many", "may", "meanwhile", "might", "mill", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "put", "que", "rather", "re", "really", "said", "same", "see", "seem", "seemed",
//"seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "take", "ten", "than", "that", "the", "their", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thin", "things", "think", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "ve", "very", "via", "was", "way", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves");
// Set up replacements for common entities that PHP won't convert.
// The two arrays are parallel: $entities[i] is rewritten to $replacements[i] when both
// are handed to str_replace(). The entity names must be spelled out in escaped form;
// writing the raw characters here would turn the substitution into a no-op.
$entities = array("&apos;", "&nbsp;", "&lt;", "&gt;", "&quot;");
$replacements = array("'", " ", "<", ">", "\"");
// Per-word bookkeeping, all starting out empty:
//   $counts       - timestamps of the recent occurrences of each word
//   $prevcounts   - the rate last reported for each word
//   $wordschanged - words whose rate needs to be re-evaluated
$counts = $prevcounts = $wordschanged = array();
// Socket handles, false until a connection has been opened:
//   $ip - input (the stream being read), $op - output (the controller channel)
$ip = $op = false;
// Data read from the input stream that has not been processed yet
$buffer = "";
// Main daemon loop. Each pass:
//   1. makes sure the Sixapart stream and the Meteor controller channel are connected,
//   2. reads whatever the stream has available and indexes every complete item in it,
//   3. sends Meteor the new per-minute rate of every word whose rate moved by more than 2%,
//   4. sleeps for half a second.
// It reads and updates the script-level state: $ip, $op, $buffer, $counts, $prevcounts
// and $wordschanged, and reads $terms, $entities and $replacements.

// Index the target terms by word so the per-word membership test is a hash lookup
$termlookup = array_flip($terms);
// Rolling window: rates are occurrences per minute, averaged over this many minutes
$minutes = 5;
$window = $minutes * 60;

// Infinite loop turns the script into a daemon
while (true) {
    // Open (or re-open) the connection to the Sixapart stream
    if (!is_resource($ip) || feof($ip)) {
        if (is_resource($ip)) {
            // Release the dead socket rather than leaking one handle per reconnect
            fclose($ip);
        }
        // Anything still buffered is a fragment of the previous response. It can never
        // be completed, and would be glued onto the first item of the new response.
        $buffer = "";
        $ip = fsockopen("updates.sixapart.com", 80, $errno, $errstr, 10);
        if (!$ip) {
            echo "$errstr ($errno)\n\n";
            // Non-zero status so a supervising process can tell this was a failure
            exit(1);
        }
        $out = "GET /atom-stream.xml HTTP/1.1\r\n";
        $out .= "Host: updates.sixapart.com\r\n";
        $out .= "Connection: Close\r\n\r\n";
        fwrite($ip, $out);
        // Non-blocking, so that a quiet stream never stalls the loop
        stream_set_blocking($ip, false);
    }

    // Open (or re-open) a controller channel to the Meteor server
    if (!is_resource($op) || feof($op)) {
        echo "Reconnecting to Meteor\n";
        if (is_resource($op)) {
            fclose($op);
        }
        $op = fsockopen("meteorcontroller", 4671, $errno, $errstr, 5);
        if (!$op) {
            echo "Meteor not responding\n";
            sleep(5);
            continue;
        }
        stream_set_blocking($op, false);
    }

    // Read from Sixapart stream; a non-blocking read may legitimately return nothing
    $chunk = fread($ip, 4096);
    if ($chunk !== false) {
        $buffer .= $chunk;
    }

    // Check for new items: each complete <entry>...</entry> element is one item
    $nummatches = preg_match_all('/<entry\b.*?<\/entry>/si', $buffer, $matches, PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE);
    if ($nummatches > 0) {
        foreach ($matches[0] as $item) {
            // Extract the content part; items without a <content> element are skipped
            if (!preg_match('/<content\b[^>]*>\s*(.*?)\s*<\/content>/si', $item[0], $m)) {
                continue;
            }

            // Filter for non-ASCII posts (mainly non-English posts) by sampling the
            // first 51 bytes of the content
            $nonascii = 0;
            $probe = min(51, strlen($m[1]));
            for ($i = 0; $i < $probe; $i++) {
                if (ord($m[1][$i]) > 127) {
                    $nonascii++;
                }
            }
            if ($nonascii >= 10) {
                continue;
            }

            // Decode and strip tags
            $content = strip_tags(str_replace($entities, $replacements, html_entity_decode($m[1])));
            // Lowercase, then split on anything that is not a letter or hyphen. This drops
            // non-words (phone numbers, ascii art, weird l33t-speak...) and, thanks to
            // PREG_SPLIT_NO_EMPTY, never yields empty tokens that would inflate the count.
            $words = preg_split('/[^a-z\-]+/', strtolower($content), -1, PREG_SPLIT_NO_EMPTY);

            $keytermcount = 0;
            $stamp = time();
            echo "Indexing... ";
            foreach ($words as $word) {
                // Keep the word only if it is one of the target terms
                if (isset($termlookup[$word])) {
                //if (strlen($word) > 1 and !in_array($word, $stopwords)) {
                    // Add the time of this occurrence to the occurrences list for this word
                    $counts[$word][] = $stamp;
                    $keytermcount++;
                    // Flag the word so that its rate is re-evaluated below
                    $wordschanged[$word] = true;
                }
            }
            echo count($words)." words: ".$keytermcount." key terms.\n";
        }

        // Trim the input buffer to remove any whole items that we've processed
        $last = $matches[0][$nummatches - 1];
        $cutoff = $last[1] + strlen($last[0]);
        echo "Trimming buffer at $cutoff... ";
        $buffer = substr($buffer, $cutoff);
        echo "Done\n";
    }

    // A word's rate also changes when its old occurrences age out of the window, so
    // flag those words too. Otherwise a word that stops appearing would keep its last
    // reported rate forever.
    $expiry = time() - $window;
    foreach ($counts as $w => $stamps) {
        if (!empty($stamps) && $stamps[0] < $expiry) {
            $wordschanged[$w] = true;
        }
    }

    // Loop over the words whose rate may have changed
    echo "Sending updates\n";
    foreach (array_keys($wordschanged) as $w) {
        // Remove any occurrences that are older than the window. Timestamps are
        // appended in order, so the expired ones form a prefix of the list.
        $stamps = isset($counts[$w]) ? $counts[$w] : array();
        $total = count($stamps);
        $expired = 0;
        while ($expired < $total && $stamps[$expired] < $expiry) {
            $expired++;
        }
        if ($expired > 0) {
            $stamps = array_slice($stamps, $expired);
        }
        if (empty($stamps)) {
            unset($counts[$w]);
        } else {
            $counts[$w] = $stamps;
        }

        // Calculate the new rolling average and compare it to the previous one
        $prev = isset($prevcounts[$w]) ? $prevcounts[$w] : 0;
        $now = round(count($stamps) / $minutes, 1);

        // If more than 2% different, send the update to Meteor
        if (abs($now - $prev) > (0.02 * $prev)) {
            $out = "ADDMESSAGE angst {w:'".addslashes((string) $w)."',c:".$now."}\n";
            echo "> $out";
            if (fwrite($op, $out) === false) {
                // Drop the channel; it is re-opened on the next pass. The word stays
                // flagged and $prevcounts is left alone, so the update is retried.
                echo "Write to Meteor failed\n";
                fclose($op);
                $op = false;
                break;
            }
            $prevcounts[$w] = $now;
        }
        unset($wordschanged[$w]);
    }

    // Give Meteor time to respond - 10ms
    usleep(10000);
    if (is_resource($op)) {
        // Read and discard whatever Meteor replied; the content is not inspected
        fread($op, 4096);
    }

    // Sleep for 500ms
    // Since the rolling average is calculated with second resolution
    // it's pointless looping more than twice a second
    echo "Sleeping\n";
    usleep(500000);
}