really first commit

web-archive-group · Feb 8, 2016 · 5db386b · 5db386b
commit 5db386b
Show file tree

Hide file tree

Showing 10 changed files with 165 additions and 0 deletions.
diff --git a/OCI-IFO/README.md b/OCI-IFO/README.md
@@ -0,0 +1,2 @@
+Some stuff
+# OCI-IFO
diff --git a/OCI-IFO/data_samples/article.txt b/OCI-IFO/data_samples/article.txt
@@ -0,0 +1,14 @@
+{
+  "_id": ObjectId("56b782be1d41c83ce0c7010b"),
+  "author": "Jon Azpiri",
+  "title": "2-year-old girl in Langley dies in ‘tragic accident’",
+  "full_article": "<p>A two-year-old girl who was found in a Langley pond has died.</p><p>Neighbours, RCMP and search and rescue crews scoured an area around Robertson Crescent early Saturday evening for signs of the girl after she was reported missing. Shortly after 7 p.m., they discovered the girl in a pond they had drained.</p><p>She was rushed to hospital in grave condition. RCMP said she has since passed away.</p><p>Police described the incident as a “tragic accident” and said there were no signs of foul play.</p><p>No names have been released.</p><p class=\"story-ad align-middle\"></p>",
+  "in_drupal": false,
+  "language": "en",
+  "media_url": "http://globalnews.ca/",
+  "original_article_url": "http://globalnews.ca/news/2502837/2-year-old-girl-in-langley-dies-in-tragic-accident/",
+  "published_date": "Sun, 07 Feb 2016 17:35:42 +0000",
+  "redirected_article_url": "http://globalnews.ca/news/2502837/2-year-old-girl-in-langley-dies-in-tragic-accident/",
+  "rss_url": "http://globalnews.ca/bc/feed/",
+  "scraped_on": NumberInt(1454867134)
+}
diff --git a/OCI-IFO/data_samples/media.txt b/OCI-IFO/data_samples/media.txt
@@ -0,0 +1,13 @@
+{
+  "_id": ObjectId("56993b2d1d41c8155edcc182"),
+  "original_website_url": "http://www.nationalpost.com",
+  "organisation_type": "media",
+  "wiki_url": "http://en.wikipedia.org/wiki/National_Post",
+  "dbpedia_url": "http://dbpedia.org/resource/National_Post",
+  "organisation": "National Post",
+  "avgcirculation": "92212.666666666666667",
+  "redirected_website_url": "http://www.nationalpost.com/index.html",
+  "scraped_on": NumberInt(1452882733),
+  "dbpedia_owner": "http://dbpedia.org/resource/Postmedia_Network",
+  "owner": "Postmedia Network Canada Corporation"
+}
diff --git a/OCI-IFO/data_samples/rss_feed.txt b/OCI-IFO/data_samples/rss_feed.txt
@@ -0,0 +1,8 @@
+{
+  "_id": ObjectId("56aa81491d41c866f058a0dc"),
+  "rss_url": "http://www.chroniclejournal.com/search/?c%5B%5D=news%2Flocal%2Cnews%2Flocal%2F%2A&d=&d1=&d2=&f=rss&l=100&nsa=eedition&q=&s=start_time&sd=desc&t=article",
+  "language": "en",
+  "valid_rss": NumberInt(1),
+  "scraped_on": NumberInt(1454014793),
+  "media_url": "http://www.chroniclejournal.com/"
+}
diff --git a/OCI-IFO/php/articles_by_keywords.php b/OCI-IFO/php/articles_by_keywords.php
@@ -0,0 +1,33 @@
+<?php
+
+try {
+    // Connection to MongoDB on the IFO server (note: credentials are embedded in the URI).
+    $conn = new Mongo('mongodb://hackathon_reader:To0305@www.oci-ifo.org:27017/scrapy?authMechanism=MONGODB-CR');
+    $db = $conn->scrapy;
+    $collection = $db->articles;
+
+    // Query to MongoDB.
+    // This query looks for a particular keyword in articles. "TransCanada" may be replaced by any keyword.
+    // The pattern is single-quoted on purpose: inside double quotes PHP would expand
+    // "$TransCanada" as an (undefined) variable, leaving a pattern that matches every article.
+    // It is unanchored and case-insensitive, so the keyword is found anywhere in the article body.
+    $query = array(
+        'full_article' => array('$regex' => new MongoRegex('/TransCanada/i'))
+    );
+
+    // Collect up to 1000 matches, each wrapped as array(<cursor key> => <document>).
+    $json = array();
+    $cursor = $collection->find($query)->limit(1000);
+    foreach ($cursor as $id => $doc) {
+        $json[] = array($id => $doc);
+    }
+    echo json_encode($json);
+
+    // Disconnect from server.
+    $conn->close();
+} catch (MongoConnectionException $e) {
+    die('Error connecting to MongoDB server');
+} catch (MongoException $e) {
+    die('Error: ' . $e->getMessage());
+}
+?>
diff --git a/OCI-IFO/python/articles_by_author.py b/OCI-IFO/python/articles_by_author.py
@@ -0,0 +1,19 @@
+from pymongo import MongoClient
+
+# connection to MongoDB on the IFO server
+uri = "mongodb://hackathon_reader:To0305@www.oci-ifo.org:27017/scrapy?authMechanism=MONGODB-CR"
+client = MongoClient(uri)
+db = client.scrapy    
+collection = db.articles
+
+#uncomment to save output to file
+#f = open('data.txt', 'w') 
+
+#Query to MongoDB
+#This query outputs a list of articles wrtitten by an author.
+articles = collection.find({"author": "Ryan White" }).limit( 1000 )
+for x in articles:
+    print x
+#uncomment to save output to file
+#    print  >>f, x 
+
diff --git a/OCI-IFO/python/articles_by_keywords.py b/OCI-IFO/python/articles_by_keywords.py
@@ -0,0 +1,19 @@
+from pymongo import MongoClient
+
+# connection to MongoDB on the IFO server
+uri = "mongodb://hackathon_reader:To0305@www.oci-ifo.org:27017/scrapy?authMechanism=MONGODB-CR"
+client = MongoClient(uri)
+db = client.scrapy    
+collection = db.articles
+
+#uncomment to save output to file
+#f = open('data.txt', 'w') 
+
+#Query to MongoDB
+#This query looks for a particular keyword in articles. TransCanada may be replaced by anything
+articles = collection.find({"full_article": {'$regex':'TransCanada'}}).limit( 1000 )
+for x in articles:
+    print x
+#uncomment to save output to file
+#    print  >>f, x 
+
diff --git a/OCI-IFO/python/articles_by_media.py b/OCI-IFO/python/articles_by_media.py
@@ -0,0 +1,19 @@
+from pymongo import MongoClient
+
+# connection to MongoDB on the IFO server
+uri = "mongodb://hackathon_reader:To0305@www.oci-ifo.org:27017/scrapy?authMechanism=MONGODB-CR"
+client = MongoClient(uri)
+db = client.scrapy    
+collection = db.articles
+
+#uncomment to save output to file
+#f = open('data.txt', 'w') 
+
+#Query to MongoDB
+#This query outputs a list of articles published by a media.
+articles = collection.find({"media_url": "http://www.thestar.com/" }).limit( 1000 )
+for x in articles:
+    print x
+#uncomment to save output to file
+#    print  >>f, x 
+
diff --git a/OCI-IFO/python/media_list.py b/OCI-IFO/python/media_list.py
@@ -0,0 +1,19 @@
+from pymongo import MongoClient
+
+# connection to MongoDB on the IFO server
+uri = "mongodb://hackathon_reader:To0305@www.oci-ifo.org:27017/scrapy?authMechanism=MONGODB-CR"
+client = MongoClient(uri)
+db = client.scrapy    
+collection = db.websites
+
+#uncomment to save output to file
+#f = open('data.txt', 'w') 
+
+#Query to MongoDB
+#This query outputs a list media in the database.
+media = collection.find({"original_website_url": {"$exists" : 1}}).limit( 1000 )
+for x in media:
+    print x
+#uncomment to save output to file
+#    print  >>f, x 
+
diff --git a/OCI-IFO/python/media_newspapers_by_circulation.py b/OCI-IFO/python/media_newspapers_by_circulation.py
@@ -0,0 +1,19 @@
+from pymongo import MongoClient
+
+# connection to MongoDB on the IFO server
+uri = "mongodb://hackathon_reader:To0305@www.oci-ifo.org:27017/scrapy?authMechanism=MONGODB-CR"
+client = MongoClient(uri)
+db = client.scrapy    
+collection = db.websites
+
+#uncomment to save output to file
+#f = open('data.txt', 'w') 
+
+#Query to MongoDB
+#This query outputs a list newspapers and their circulation.
+media = collection.find({"avgcirculation": {"$ne" : None}}).limit( 1000 )
+for x in media:
+    print x
+#uncomment to save output to file
+#    print  >>f, x 
+