<?php
	/* Crawler version and date of last change; the version is also embedded in MAGPIE_USER_AGENT below */
	$skel["overload_crawler_version"] = "0.0.19";
	$skel["overload_crawler_lastmodified"] = "2006-02-04";

	/* NOTE(review): not read anywhere in this file; presumably consulted by debugtxt()/debugarray(), which are defined elsewhere -- confirm */
	$debuglogging = true;

	/*
	 * MagpieRSS settings. They are all defined before rss_fetch.inc is required
	 * further down, presumably so the library picks them up while loading.
	 */
	define('MAGPIE_DEBUG', 1);
	/* Parsed feed data is delivered as UTF-8 */
	define('MAGPIE_OUTPUT_ENCODING', 'UTF-8');
	//define('MAGPIE_OUTPUT_ENCODING', 'ISO-8859-1');

	//define("MAGPIE_INPUT_ENCODING", "ISO-8859-1");
	define('MAGPIE_DETECT_ENCODING', true);

	/*
	define('MAGPIE_OUTPUT_ENCODING', 'UTF-8');
	define('MAGPIE_INPUT_ENCODING', 'UTF-8');
	define('MAGPIE_DETECT_ENCODING', true);
	*/

	/*
	 * Cache settings. $skel["overload_cachedir"] is not set in this file and is
	 * expected to be provided by the including script.
	 * NOTE(review): MAGPIE_CACHE_AGE is presumably in seconds (one hour, matching
	 * the hourly crawl mentioned at crawlFeeds()) -- confirm against the MagpieRSS docs.
	 */
	define('MAGPIE_CACHE_DIR', $skel["overload_cachedir"]);
	define('MAGPIE_CACHE_ON', 1);
	define('MAGPIE_CACHE_AGE', 3600);
	define('MAGPIE_CONDITIONAL_GET_ON', 1);
	define('MAGPIE_CACHE_FRESH_ONLY', true);

	/* Define our own user agent string */
	define('MAGPIE_USER_AGENT', "overload feedreader/" . $skel["overload_crawler_version"] . " (https://overload.aquariusoft.org/)");
	
	require ("magpierss/rss_fetch.inc");

	/*
	 * Return codes of itemShouldUpdate(): 0 means the item has to be inserted,
	 * -1 means nothing has to be done. Any positive return value is the id of
	 * the stored item that has to be updated.
	 */
	define("ITEM_TOBEADDED", 0);
	define("ITEM_NOACTION", -1);

	//ini_set("user_agent","Opera/8.0 (compatible; MSIE 6.0; Windows NT 5.0)\r\n");
	//ini_set("user_agent","overload feedreader crawler [http://aquariusoft.org/overload/]\r\n");


	/*** Crawler functions ***/

	/*
	 * Get feed uri/id pairs, either for every feed or for one user's subscriptions.
	 *
	 * @param array $skel   Application context, passed through to doQuery()
	 * @param int   $userid Id of the user whose subscribed feeds are wanted, or -1 for all feeds
	 * @return array|null   List of array("uri" => ..., "id" => ...) entries, or
	 *                      null when doQuery() returned no result
	 */
	function getFeedList($skel, $userid)
	{
		if ($userid == -1)
		{
			debugtxt("Get all feeds...");
			/* fetch all feeds' url and id */
			$query = 'SELECT overload_feed.uri, overload_feed.id FROM overload_feed;';
		} else
		{
			/*
			 * overload_userfeed has to be listed in FROM as well: the WHERE clause
			 * refers to it, and without it the query fails on an unknown table.
			 * The user id is forced to an integer so it cannot inject SQL.
			 */
			$query = 'SELECT overload_feed.uri, overload_feed.id FROM overload_feed, overload_userfeed ' .
				'WHERE overload_feed.id = overload_userfeed.feedid ' .
				'AND overload_userfeed.userid = ' . intval($userid) . ';';
		}

		$result = doQuery($skel, $query);
		if ($result == null)
		{
			return null;
		}

		$list = array();
		while ($row = mysql_fetch_row($result))
		{
			$list[] = array("uri" => $row[0], "id" => $row[1]);
		}
		return $list;
	}


	/*
	 * Bring the stored name, subtitle and image data of a feed in line with what
	 * the freshly fetched feed reports.
	 *
	 * Returns true when an UPDATE was issued, false when nothing differed.
	 */
	function updateFeed($skel, $feedid, $feed)
	{
		$stored = getFeed($skel, $feedid);
		$fetched = getFeedInfo($skel, $feed);

		/* A feed without a name of its own keeps the name that is already stored */
		$keepStoredName = ($fetched["name"] == "" && $stored["name"] != "");
		$feedname = $keepStoredName ? $stored["name"] : $fetched["name"];

		$unchanged = ($stored["name"] == $feedname
			&& $stored["subtitle"] == $fetched["subtitle"]
			&& $stored["feedimage"] == $fetched["feedimage"]
			&& $stored["feedimagedesc"] == $fetched["feedimagedesc"]);
		if ($unchanged)
		{
			return false;
		}

		$assignments = array(
			'name="' . mysql_real_escape_string($feedname) . '"',
			'subtitle="' . mysql_real_escape_string($fetched["subtitle"]) . '"',
			'feedimage="' . mysql_real_escape_string($fetched["feedimage"]) . '"',
			'feedimagedesc="' . mysql_real_escape_string($fetched["feedimagedesc"]) . '"'
		);
		$query = 'UPDATE overload_feed SET ' . implode(', ', $assignments) .
			' WHERE overload_feed.id=' . $feedid . ';';
		$result = doQuery($skel, $query);

		return true;
	}


	/*
	 * Crawls every feed in the DB. Should be run from a cronjob once an hour.
	 * @TODO: make it parallelizable
	 *
	 * Every feed is fetched with fetch_rss(). On success its items are stored
	 * through storeFeed(); either way the outcome is recorded with feedIsUpdated().
	 *
	 * @param array $skel Application context, handed on to the DB helpers
	 */
	function crawlFeeds($skel)
	{
		/* Get feeds. -1 means all */
		$feeds = getFeedList($skel, -1);
		if (!is_array($feeds))
		{
			/* getFeedList() returns null when its query gave no result: nothing to crawl */
			return;
		}

		foreach ($feeds as $feedrow)
		{
			debugtxt("Processing [" . $feedrow["id"] . "] " . $feedrow["uri"] . "...");
			// @TODO: calculate number of subscribers into $nrofsubscribers
			//@TODO: update // define('MAGPIE_USER_AGENT', "overload feedreader crawler/" . $skel["overload_crawler_version"] . " (https://aquariusoft.org/overload/; " . $nrofsubscribers . ")");
			$rss = fetch_rss($feedrow["uri"]);
			if ($rss != null)
			{
				/* storeFeed() refreshes the feed's metadata via updateFeed() itself, so that call is not repeated here */
				storeFeed($skel, $feedrow["id"], $rss);
				feedIsUpdated($skel, $feedrow["id"], true);
			} else
			{
				feedIsUpdated($skel, $feedrow["id"], false);
			}
		}
	}


	/*
	 * Returns name, site uri, subtitle and image data of a fetched feed.
	 *
	 * Every field falls back to an empty string when the feed does not provide
	 * it, so a sparse feed no longer triggers undefined-index notices or a
	 * count() on a missing image.
	 *
	 * @param array  $skel Application context (not used here)
	 * @param object $feed Parsed feed; its channel and image properties are read as arrays
	 * @return array Keys: siteuri, name, subtitle, feedimage, feedimagedesc
	 */
	function getFeedInfo($skel, $feed)
	{
		$channel = (isset($feed->channel) && is_array($feed->channel)) ? $feed->channel : array();
		$image = (isset($feed->image) && is_array($feed->image)) ? $feed->image : array();

		$feedinfo = array();
		$feedinfo["siteuri"] = isset($channel["link"]) ? $channel["link"] : "";
		$feedinfo["name"] = isset($channel["title"]) ? $channel["title"] : "";
		$feedinfo["subtitle"] = isset($channel["description"]) ? $channel["description"] : "";
		/* A description that holds nothing but markup/whitespace counts as no subtitle */
		if (trim(strip_tags($feedinfo["subtitle"])) == "")
		{
			$feedinfo["subtitle"] = "";
		}
		$feedinfo["feedimage"] = isset($image["url"]) ? $image["url"] : "";
		$feedinfo["feedimagedesc"] = isset($image["title"]) ? $image["title"] : "";

		return $feedinfo;
	}


	/*
	 * Store the items of a fetched feed in the DB.
	 *
	 * The feed's own metadata is refreshed through updateFeed() first. Then, for
	 * every item, uri, content, title and author are derived, itemShouldUpdate()
	 * decides what has to happen, and the item is inserted (addItem), rewritten
	 * (updateItem) or left alone. The uris contained in new or changed items are
	 * (re)indexed with cleanContainsURIs()/addContainsURIs().
	 *
	 * @param array  $skel   Application context, handed on to the DB helpers
	 * @param int    $feedid Id of the feed the items belong to
	 * @param object $feed   Parsed feed; its items property is iterated
	 */
	function storeFeed($skel, $feedid, $feed)
	{
		/*
		 * Accented characters and the HTML entities they are stored as. The
		 * characters are spelled out as UTF-8 byte sequences (the feed data is
		 * delivered as UTF-8, see MAGPIE_OUTPUT_ENCODING) so that re-encoding this
		 * source file can never turn them into empty search strings, which would
		 * silently disable the replacement.
		 * Note: items whose stored content still holds the raw characters will be
		 * seen as changed by itemShouldUpdate() once, the first time they are crawled.
		 */
		$entities = array(
			"\xC3\xA4" => "&auml;",
			"\xC3\xA3" => "&atilde;",
			"\xC3\xA9" => "&eacute;",
			"\xC3\xA8" => "&egrave;",
			"\xC3\xAB" => "&euml;",
			"\xC3\xAF" => "&iuml;",
			"\xC3\xAD" => "&iacute;",
			"\xC3\xAC" => "&igrave;",
			"\xC3\x8D" => "&Iacute;",
			"\xC3\xB1" => "&ntilde;",
			"\xC3\xB6" => "&ouml;",
			"\xC3\xB8" => "&oslash;",
			"\xC3\x98" => "&Oslash;",
			"\xC3\xBC" => "&uuml;"
		);

		updateFeed($skel, $feedid, $feed);

		/* Ids of stored items that contain uris; kept for the related-items pass (updateRelatedItems), which is currently disabled */
		$itemsWithLinks = array();

		/* A feed without an items list simply has nothing to store */
		$items = (isset($feed->items) && is_array($feed->items)) ? $feed->items : array();

		foreach ($items as $item)
		{
			/* The link identifies the item; fall back to the guid when there is none */
			$uri = "";
			if (isset($item["link"]))
			{
				$uri = $item["link"];
			}
			if ($uri == "" && isset($item["guid"]))
			{
				$uri = $item["guid"];
			}

			/* content:encoded, when present, wins over the description */
			$content = "";
			if (isset($item["description"]))
			{
				$content = $item["description"];
			}
			if (isset($item["content"]["encoded"]))
			{
				$content = $item["content"]["encoded"];
			}

			/* Clean up crappy content a bit */
			$content = str_replace("& ", "&amp; ", $content);
			$content = str_replace("<br>", "<br />", $content);
			$content = str_replace("<BR>", "<br />", $content);
			/* Encode 'weird' characters to their right html code */
			$content = strtr($content, $entities);

			$title = "";
			if (isset($item["title"]))
			{
				$title = $item["title"];
			}
			if ($title == "")
			{
				/* No title given: use the first 60 bytes of the plain-text content */
				$plaintext = strip_tags($content);
				$title = substr($plaintext, 0, 60);
				if (strlen($plaintext) > 60)
				{
					$title .= "...";
				}
			}
			$title = str_replace("& ", "&amp; ", $title);
			$title = str_replace("&amp;#", "&#", $title);
			$title = trim($title);
			if ($title == "")
			{
				/* Post only contains image or something */
				$datetimestamp = time();
				$itemdtstamp = getItemDatestamp($item);
				if ($itemdtstamp != "")
				{
					$datetimestamp = $itemdtstamp;
				}
				$title = "[no title - " . $datetimestamp . "]";
			}

			$author = "";
			if (isset($item["author"]) && $item["author"] != "")
			{
				$author = $item["author"];
			} else if (isset($item["dc"]["creator"]) && $item["dc"]["creator"] != "")
			{
				$author = $item["dc"]["creator"];
			}
			$author = trim($author);

			$action = itemShouldUpdate($skel, $feedid, $uri, $title, $content, $author);
			if ($action > 0)
			{
				/* id is stored in $action */
				updateItem($skel, $item, $title, $uri, $content, $author, $action);
				cleanContainsURIs($skel, $action);
				if (addContainsURIs($skel, $action, $uri, $content) > 0)
				{
					$itemsWithLinks[] = $action;
				}
			} else if ($action == ITEM_TOBEADDED)
			{
				$itemid = addItem($skel, $feedid, $title, $uri, $author, $item, $content);
				if (addContainsURIs($skel, $itemid, $uri, $content) > 0)
				{
					$itemsWithLinks[] = $itemid;
				}
			}
			/* Else, it's already cached and no action has to take place */
		}
	}


	/*
	 * Insert a new item, its enclosures and an unread marker for every
	 * subscriber of the feed.
	 *
	 * @param array  $skel    Application context, handed on to the DB helpers
	 * @param int    $feedid  Id of the feed the item belongs to
	 * @param string $title   Item title
	 * @param string $uri     Item uri
	 * @param string $author  Item author
	 * @param array  $item    Raw feed item; read for its date fields and its "enclosure" list
	 * @param string $content Item content
	 * @return mixed Id of the inserted item as reported by getLastInsertID()
	 */
	function addItem($skel, $feedid, $title, $uri, $author, $item, $content)
	{
		debugtxt("addItem: feed: " . $feedid . ", item: " . (isset($item["title"]) ? $item["title"] : ""));
		$datetimestamp = time();
		$itemdtstamp = getItemDatestamp($item);
		if ($itemdtstamp != "")
		{
			$datetimestamp = $itemdtstamp;
		}

		/* The uri comes straight from the feed, so it is escaped like the other text fields */
		$query = 'INSERT INTO overload_item ' .
			'SET feedid=' . intval($feedid) . ', title="' . mysql_real_escape_string($title) . '", content="' . mysql_real_escape_string($content) . '", ' .
			'author="' . mysql_real_escape_string($author) . '", ' .
			'uri="' . mysql_real_escape_string($uri) . '", date="' . date("Y-m-d G:i:s", time()) . '", ' .
			'datetimestamp="' . $datetimestamp . '";';
		$result = doQuery($skel, $query);

		/* Get ID of the inserted item */
		$itemid = getLastInsertID($skel);

		/* Most items carry no enclosures at all */
		if (isset($item["enclosure"]) && is_array($item["enclosure"]))
		{
			foreach ($item["enclosure"] as $enclosure)
			{
				addEnclosure($skel, $itemid, $enclosure);
			}
		}

		/* Get all users that are subscribed to this feed */
		$query = 'SELECT overload_userfeed.userid FROM overload_userfeed ' .
			'WHERE overload_userfeed.feedid=' . intval($feedid) . ';';
		$users = doQuery($skel, $query);

		if ($users != null)
		{
			while ($row = mysql_fetch_row($users))
			{
				/* Update user's items: add this item */
				$query = 'INSERT INTO overload_useritem ' .
					'SET userid=' . intval($row[0]) . ', ' .
					'itemid=' . $itemid . ', isread=0;';
				$result = doQuery($skel, $query);
			}
		}
		return $itemid;
	}


	/*
	 * Overwrite a stored item with its freshly fetched version, replace its
	 * enclosures and flag it as unread again for every user.
	 *
	 * @param array  $skel    Application context, handed on to the DB helpers
	 * @param array  $item    Raw feed item; read for its date fields and its "enclosure" list
	 * @param string $title   New title
	 * @param string $uri     New uri
	 * @param string $content New content
	 * @param string $author  New author
	 * @param int    $id      Id of the stored item to overwrite
	 */
	function updateItem($skel, $item, $title, $uri, $content, $author, $id)
	{
		debugtxt("== updateItem: item: " . (isset($item["title"]) ? $item["title"] : ""));

		$datetimestamp = time();
		$itemdtstamp = getItemDatestamp($item);
		if ($itemdtstamp != "")
		{
			$datetimestamp = $itemdtstamp;
		}

		/* The uri comes straight from the feed, so it is escaped like the other text fields */
		$query = 'UPDATE overload_item ' .
			'SET title="' . mysql_real_escape_string($title) . '", content="' . mysql_real_escape_string($content) . '", ' .
			'uri="' . mysql_real_escape_string($uri) . '", date="' . date("Y-m-d G:i:s", time()) . '", ' .
			'author="' . mysql_real_escape_string($author) . '", ' .
			'postmodifiedat="' . date("Y-m-d G:i:s", $datetimestamp) . '" ' .
			'WHERE overload_item.id=' . intval($id) . ';';

		$result = doQuery($skel, $query);

		/*
		 * Replace the enclosures. They have to be attached to $id: the previous
		 * code passed an undefined variable here, so the enclosures removed above
		 * were never stored again.
		 */
		removeEnclosures($skel, $id);
		if (isset($item["enclosure"]) && is_array($item["enclosure"]))
		{
			foreach ($item["enclosure"] as $enclosure)
			{
				addEnclosure($skel, $id, $enclosure);
			}
		}

		/* Set user's item to unread where applicable */
		$query = 'UPDATE overload_useritem ' .
			'SET isread=0 ' .
			'WHERE itemid=' . intval($id) . ';';

		$result = doQuery($skel, $query);
	}


	/*
	 * Figures out if the item has been updated since last time.
	 *
	 * The stored item is looked up by uri, title and feed.
	 *
	 * @param array  $skel    Application context, handed on to doQuery()
	 * @param int    $feedid  Id of the feed the item belongs to
	 * @param string $uri     Item uri
	 * @param string $title   Item title
	 * @param string $content Item content as it would be stored
	 * @param string $author  Item author as it would be stored
	 * @return mixed ITEM_TOBEADDED when no matching item is stored, ITEM_NOACTION
	 *               when the stored item is identical, otherwise the id of the
	 *               stored item that has to be updated
	 */
	function itemShouldUpdate($skel, $feedid, $uri, $title, $content, $author)
	{
		/* The uri comes straight from the feed, so it is escaped just like the title */
		$query = 'SELECT overload_item.id, overload_item.title, overload_item.content, overload_item.author FROM overload_item ' .
			'WHERE overload_item.uri = "' . mysql_real_escape_string($uri) . '" ' .
			'AND overload_item.title = "' . mysql_real_escape_string($title) . '" ' .
			'AND overload_item.feedid=' . intval($feedid) . ';';
		$result = doQuery($skel, $query);
		if ($result == null)
		{
			/* Item doesn't exist yet [or title has changed], so no update but add */
			return ITEM_TOBEADDED;
		}

		$row = mysql_fetch_row($result);
		if (!$row)
		{
			/* Empty result set: same situation, the item is not stored yet */
			return ITEM_TOBEADDED;
		}

		if ($row[1] == "" && $title != "")
		{
			return ITEM_TOBEADDED;
		}

		/*
		 * Compare as strings: a loose comparison would treat numeric-looking
		 * texts such as "1.0" and "1" as equal and miss the change.
		 */
		if ((string) $content !== (string) $row[2] || (string) $author !== (string) $row[3])
		{
			/* Item has been updated, return its id */
			debugtxt("== should update! content differs | title=\"" . $title . "\" | old title=\"" . $row[1] . "\"");
			debugtxt("uri: " . $uri . "<br />\n======");
			return $row[0];
		}

		return ITEM_NOACTION;
	}


	/*
	 * Record the outcome of a crawl attempt on the feed's row.
	 *
	 * A successful crawl stamps lastupdate with the current time and sets
	 * lastupdatesuccessful to 1; a failed one only sets lastupdatesuccessful to 0.
	 */
	function feedIsUpdated($skel, $feedid, $success)
	{
		if ($success)
		{
			$assignment = 'SET lastupdate="' . date("Y-m-d G:i:s", time()) . '", lastupdatesuccessful=1 ';
		} else
		{
			$assignment = 'SET lastupdatesuccessful=0 ';
		}

		doQuery($skel, 'UPDATE overload_feed ' . $assignment . 'WHERE overload_feed.id=' . $feedid . ';');
	}


	/*** RSS date functions ***/

	/*
	 * Work out the unix timestamp of a feed item.
	 *
	 * Three date fields are tried; when several are usable the later one in
	 * this list wins: atom "issued", dc:date, RSS 2 "pubdate". A field that
	 * cannot be parsed is skipped, so it no longer discards a date found in
	 * another field. Without any usable date the current time is returned.
	 *
	 * @param array $item Raw feed item
	 * @return int Unix timestamp
	 */
	function getItemDatestamp($item)
	{
		// missing fields simply count as "not given"
		$rss_2_date = isset($item["pubdate"]) ? $item["pubdate"] : "";
		$rss_1_date = isset($item["dc"]["date"]) ? $item["dc"]["date"] : "";
		$atom_date = isset($item["issued"]) ? $item["issued"] : "";

		// set default undefined value
		$in_date = "";

		// atom date
		// NOTE(review): parse_w3cdtf() lives in MagpieRSS and is understood to return -1 on failure -- confirm
		if ($atom_date != "")
		{
			$parsed = parse_w3cdtf($atom_date);
			if ($parsed !== false && $parsed != -1)
			{
				$in_date = $parsed;
			}
		}

		// date defined in dc:date
		if ($rss_1_date != "")
		{
			if (strlen($rss_1_date) > 10)
			{
				$parsed = parse_w3cdtf($rss_1_date);
				if ($parsed !== false && $parsed != -1)
				{
					$in_date = $parsed;
				}
			} else if (preg_match('/([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})/', $rss_1_date, $regs))
			{
				// of the form yyyy-mm-dd: calc epoch for that date assuming GMT
				// (preg_match replaces ereg, which no longer exists as of PHP 7)
				$in_date = gmmktime(0, 0, 0, (int) $regs[2], (int) $regs[3], (int) $regs[1]);
			}
		}

		// RSS 2 pubdate; strtotime() yields false (-1 in old PHP versions) when it cannot parse
		if ($rss_2_date != "")
		{
			$parsed = strtotime($rss_2_date);
			if ($parsed !== false && $parsed != -1)
			{
				$in_date = $parsed;
			}
		}

		if ($in_date === "")
		{
			$in_date = time();
		}

		return $in_date;
	}


	/*
	 * Index the uris found in an item's text.
	 *
	 * The uris come from getFilteredURIsWithBase($text, $uri). Their number is
	 * written to the item's hasuris column and every uri gets its own row in
	 * overload_containsuri. Returns the number of uris.
	 */
	function addContainsURIs($skel, $itemid, $uri, $text)
	{
		$found = getFilteredURIsWithBase($text, $uri);

		debugtxt("== Adding:");
		debugarray($found);

		doQuery($skel, sprintf('UPDATE overload_item SET hasuris=%s WHERE id=%s;', count($found), $itemid));

		$index = 0;
		while ($index < count($found))
		{
			$insertquery = sprintf(
				'INSERT INTO overload_containsuri SET itemid=%s, uri="%s";',
				$itemid,
				mysql_real_escape_string($found[$index])
			);
			doQuery($skel, $insertquery);
			$index++;
		}

		debugtxt("== Added " . count($found) . " uris for item " . $itemid);

		return count($found);
	}

	/*
	 * Forget the uris indexed for an item: reset its hasuris counter to 0 and
	 * delete its rows from overload_containsuri. Returns the result of the
	 * DELETE query as given by doQuery().
	 */
	function cleanContainsURIs($skel, $itemid)
	{
		$resetCounter = 'UPDATE overload_item SET hasuris=0 WHERE id=' . $itemid . ';';
		doQuery($skel, $resetCounter);

		$dropRows = 'DELETE FROM overload_containsuri WHERE itemid=' . $itemid . ';';
		return doQuery($skel, $dropRows);
	}
/*
	function updateRelatedItems($skel, $itemsWithLinks)
	{
		$query = 'INSERT INTO overload_isrelated ' .
			'SELECT itemid, relatedItemid FROM overload_containsuri, overload_containsuri AS cu2 '.
			'WHERE overload_containsuri.uri=cu2.uri ' .
			'AND (';
		for ($i = 0; $i < count($itemsWithLinks); $i++)
		{
			$query .= '(overload_containsuri.itemid=' . $itemsWithLinks[$i] . ' AND cu2.itemid!=' . $itemsWithLinks[$i] . ') ';
			if ($i < (count($itemsWithLinks) - 1))
			{
				$query .= 'OR ';
			}
		}
		$query .= ');';

		echo $query . "\n";

	}
*/

	/*
	 * Store one enclosure (attached media file) of an item.
	 *
	 * @param array $skel      Application context, handed on to doQuery()
	 * @param int   $itemid    Id of the item the enclosure belongs to
	 * @param array $enclosure Enclosure data; the keys "url", "length" and "type" are read
	 * @return mixed Whatever doQuery() returns for the INSERT
	 */
	function addEnclosure($skel, $itemid, $enclosure)
	{
		debugtxt("== Adding enclosure for item " . $itemid . ":");
		debugarray($enclosure);

		/*
		 * All three values come from the feed. The length goes into the query
		 * unquoted, so it is forced to an integer: a missing or non-numeric
		 * length becomes 0 instead of breaking (or injecting into) the statement.
		 */
		$url = isset($enclosure["url"]) ? $enclosure["url"] : "";
		$length = isset($enclosure["length"]) ? intval($enclosure["length"]) : 0;
		$mimetype = isset($enclosure["type"]) ? $enclosure["type"] : "";

		$query = 'INSERT INTO overload_enclosure ' .
			'SET itemid=' . intval($itemid) . ', ' .
			'uri="' . mysql_real_escape_string($url) . '", ' .
			'length=' . $length . ', ' .
			'mimetype="' . mysql_real_escape_string($mimetype) . '";';
		return doQuery($skel, $query);
	}

	/*
	 * Delete every enclosure stored for an item and return the result of the
	 * DELETE query as given by doQuery().
	 */
	function removeEnclosures($skel, $itemid)
	{
		debugtxt("== Removing enclosures for item " . $itemid);

		$deleteQuery = 'DELETE FROM overload_enclosure WHERE itemid=' . $itemid . ';';

		return doQuery($skel, $deleteQuery);
	}
?>
