# Reconstructed from a whitespace-collapsed cgit patch. Provenance:
#   commit:  cf1209063bf68e01d312bc870280cf43b2a616a9
#   author:  mahesh
#   date:    Fri, 1 Apr 2016 01:41:16 +0530
#   subject: A crawler to crawl the tbc website to find errors on tbc
#            notebooks and broken links
#   path:    scripts/crawler/tbc_web_crawler/spiders/items.py
#            (new file, blob 9dda20f, 18 insertions)
"""Scrapy item definitions for the tbc website crawler.

Two item types are declared: one for errors found on tbc notebooks and
one for broken links, matching the two goals named in the originating
commit message.
"""

import scrapy


class TbcErrorItems(scrapy.Item):
    """Item describing errors found on a tbc notebook.

    NOTE(review): the per-field meanings below are inferred from the field
    names only; confirm against the spider that populates this item.
    """

    chapter_name = scrapy.Field()  # presumably the chapter/notebook name
    chapter_urls = scrapy.Field()  # presumably URL(s) of the chapter
    completed_book_urls = scrapy.Field()  # presumably URL(s) of the book
    number_of_errors = scrapy.Field()  # presumably a count of errors found
    error_messages = scrapy.Field()  # presumably the collected error text


class TbcBrokenItems(scrapy.Item):
    """Item describing a broken link found while crawling.

    NOTE(review): the per-field meanings below are inferred from the field
    names only; confirm against the spider that populates this item.
    """

    broken_url = scrapy.Field()  # presumably the URL that failed
    broken_status = scrapy.Field()  # presumably the failure/HTTP status