-rw-r--r-- | scripts/cron.sh       | 23
-rw-r--r-- | scripts/split_json.py | 20
2 files changed, 43 insertions, 0 deletions
diff --git a/scripts/cron.sh b/scripts/cron.sh
new file mode 100644
index 0000000..bf219be
--- /dev/null
+++ b/scripts/cron.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+DIR="$( cd "$( dirname "$0" )" && pwd )"
+cd $DIR
+
+python database_updater.py
+
+source ../../../bin/activate
+# this is for the test server. Might differ on different machines. Ideally it should be "source ../../bin/activate"
+
+
+
+cd crawler/
+
+scrapy crawl tbc_spider -o items.json -t json
+#sadly scrapy can only be run in the folders containing scrapy.cfg
+
+cd ../.
+
+python split_json.py
+
+deactivate
+
diff --git a/scripts/split_json.py b/scripts/split_json.py
new file mode 100644
index 0000000..b829c23
--- /dev/null
+++ b/scripts/split_json.py
@@ -0,0 +1,20 @@
+import cPickle
+import json
+from os.path import dirname, abspath, join
+try:
+    with open('crawler/items.json', "r") as json_dump:
+        json_data = json.load(json_dump)
+        json_dump.close()
+    a = [saved_data for saved_data in json_data if str(saved_data).startswith("{u'ch")]
+    with open(join(dirname(abspath(dirname(__file__))), 'tbc_error_page/error.json'), "w+") as error_json:
+        cPickle.dump(a, error_json)
+        error_json.close()
+
+    b = [saved_data for saved_data in json_data if str(saved_data).startswith("{u'br")]
+    with open(join(dirname(abspath(dirname(__file__))), 'tbc_error_page/broken.json'), "w+") as broken_json:
+        cPickle.dump(b, broken_json)
+        broken_json.close()
+
+
+except ValueError:
+    print "fail"
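
Note on split_json.py above: it writes the two item lists with cPickle into files named error.json and broken.json, so anything that later tries to parse them as JSON will fail. Below is a minimal sketch of the same splitting step written for Python 3 with JSON output throughout. The file paths and the "ch"/"br" prefixes are taken from the diff; the any()-over-keys check, the Python 3 syntax, and the assumption that the script is run from the scripts/ directory (as cron.sh arranges) are assumptions, not part of the commit.

#!/usr/bin/env python3
# Sketch only, not the committed script: same split as split_json.py above,
# but reading and writing JSON throughout instead of pickling into *.json files.
import json
from os.path import abspath, dirname, join

# tbc_error_page/ sits one level above scripts/, as in the committed paths.
OUT_DIR = join(dirname(abspath(dirname(__file__))), 'tbc_error_page')

def main():
    try:
        # cron.sh runs this from scripts/, so crawler/items.json is relative to that.
        with open('crawler/items.json') as json_dump:
            json_data = json.load(json_dump)
    except (OSError, ValueError) as exc:
        print("could not read items.json:", exc)
        return

    # Mirror the original startswith("{u'ch") / startswith("{u'br") tests by
    # checking the item keys themselves; the full key names are not known here.
    errors = [item for item in json_data if any(k.startswith('ch') for k in item)]
    broken = [item for item in json_data if any(k.startswith('br') for k in item)]

    with open(join(OUT_DIR, 'error.json'), 'w') as error_json:
        json.dump(errors, error_json)
    with open(join(OUT_DIR, 'broken.json'), 'w') as broken_json:
        json.dump(broken, broken_json)

if __name__ == '__main__':
    main()

In cron.sh itself, quoting the change-directory target (cd "$DIR") would also keep the script working when the checkout path contains spaces; the committed unquoted form relies on a space-free path.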