summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- scripts/cron.sh 23
-rw-r--r-- scripts/split_json.py 20
2 files changed, 43 insertions, 0 deletions
diff --git a/scripts/cron.sh b/scripts/cron.sh
new file mode 100644
index 0000000..bf219be
--- /dev/null
+++ b/scripts/cron.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Runs database_updater.py, the tbc_spider Scrapy crawl, then split_json.py.
+DIR="$( cd "$( dirname "$0" )" && pwd )"
+cd "$DIR" || exit 1
+
+python database_updater.py
+
+# Virtualenv path as laid out on the test server; it may differ on other
+# machines (ideally it would be "source ../../bin/activate").
+source ../../../bin/activate
+
+# scrapy can only be run from the folder containing scrapy.cfg
+cd crawler/ || exit 1
+
+# scrapy -o appends to an existing feed file, which leaves invalid JSON on
+# every run after the first, so remove the previous output before crawling.
+rm -f items.json
+scrapy crawl tbc_spider -o items.json -t json
+
+cd .. || exit 1
+
+python split_json.py
+deactivate
diff --git a/scripts/split_json.py b/scripts/split_json.py
new file mode 100644
index 0000000..b829c23
--- /dev/null
+++ b/scripts/split_json.py
@@ -0,0 +1,20 @@
+import cPickle
+import json
+from os.path import dirname, abspath,join
+# Split crawler/items.json into two pickled lists under ../tbc_error_page/.
+try:
+    with open('crawler/items.json', "r") as json_dump:
+        json_data = json.load(json_dump)  # the with block closes the file
+    # Parent of the directory holding this script; both outputs live below it.
+    base_dir = dirname(abspath(dirname(__file__)))
+    # NOTE(review): items are selected by the prefix of their Python 2 dict
+    # repr, which depends on which key is printed first -- confirm the keys.
+    for prefix, target in (("{u'ch", 'tbc_error_page/error.json'),
+                           ("{u'br", 'tbc_error_page/broken.json')):
+        selected = [item for item in json_data if str(item).startswith(prefix)]
+        # Despite the .json file names, the payload is written with cPickle.
+        with open(join(base_dir, target), "w+") as out_file:
+            cPickle.dump(selected, out_file)
+except ValueError as err:
+    # json.load raises ValueError when items.json is not valid JSON.
+    print "fail: %s" % err