#!/usr/bin/env python
# -*- coding: utf-8 -*-
# $Id: Pigeon.py 2052 2018-10-17 19:43:30Z Lavender $
#
# Copyright (c) 2018 Nuwa Information Co., Ltd, All Rights Reserved.
#
# Licensed under the Proprietary License, 
# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at our web site.
#
# See the License for the specific language governing permissions and 
# limitations under the License.
#
# $Author: Lavender $
# $Date: 2018-10-18 04:43:30 +0900 (週四, 18 十月 2018) $
# $Revision: 2052 $ 

import os
import logging
import datetime
import hashlib
import json

from functools import wraps

import scrapy

# Root directory (a relative path) under which archived pages are stored.
STATIC_DIR = 'static'

def _archiveResponse(spider, response):
    """Store ``response.body`` on disk and record its URL in the JSON index.

    The body is written to
    ``<STATIC_DIR>/<spider.TODAY>/<spider.name>/<md5 of the URL>.html`` and
    the mapping ``URL -> file name`` is merged into
    ``<spider.name>.json`` in the same directory.
    """
    directory = os.path.join(STATIC_DIR, str(spider.TODAY), spider.name)
    try:
        os.makedirs(directory)
    except OSError:
        # An already existing directory (possibly created by someone else
        # between a check and the call) is fine; anything else is an error.
        if not os.path.isdir(directory):
            raise

    # md5() needs bytes: a text URL has to be encoded first, otherwise
    # Python 3 raises TypeError.  Byte strings are hashed unchanged.
    url = response.url
    urlBytes = url if isinstance(url, bytes) else url.encode('utf-8')
    filename = "%s.html" % hashlib.md5(urlBytes).hexdigest()
    with open(os.path.join(directory, filename), 'wb') as f:
        f.write(response.body)

    # Merge the new entry into the index, keeping previously saved ones.
    jsonPath = os.path.join(directory, "%s.json" % spider.name)
    if os.path.isfile(jsonPath):
        with open(jsonPath, 'r') as f:
            urlDict = json.load(f)
    else:
        urlDict = {}
    urlDict[url] = filename
    with open(jsonPath, 'w') as f:
        f.write(json.dumps(urlDict, indent=4))


def saveHtml(func):
    """Decorator that archives the response before calling ``func``.

    The wrapped callable is invoked as ``func(spider, req, *args, **kwargs)``.
    When ``spider`` is a ``scrapy.Spider``, ``req`` is a
    ``scrapy.http.Response`` and ``spider.crawlDate`` is falsy, the response
    body is saved to disk and indexed first.  In every case ``func`` is then
    called with the unchanged arguments and its result is returned.
    """
    @wraps(func)
    def save(spider, req, *args, **kwargs):
        isLiveResponse = (isinstance(req, scrapy.http.Response) and
                          isinstance(spider, scrapy.Spider))
        # Nothing is archived when crawlDate is set.
        if isLiveResponse and not spider.crawlDate:
            _archiveResponse(spider, req)
        return func(spider, req, *args, **kwargs)
    return save

class Request(scrapy.Request):
    """``scrapy.Request`` that can be pointed at an archived copy of a page.

    When ``spider.crawlDate`` is set, the requested URL is looked up in
    ``<STATIC_DIR>/<spider.crawlDate>/<spider.name>/<spider.name>.json`` and
    replaced by ``http://127.0.0.1:<spider.crawlPort>/<archived file path>``.
    When it is not set, the URL is passed on untouched.
    """

    def __init__(self, spider, url, callback=None, method='GET', headers=None,
                 body=None, cookies=None, meta=None, encoding='utf-8',
                 priority=0, dont_filter=False, errback=None, flags=None):
        """Build the request, rewriting ``url`` when ``spider.crawlDate`` is set.

        :param spider: object providing ``crawlDate``, ``crawlPort`` and
            ``name``.
        :param url: the original URL to request.

        The remaining parameters are forwarded to ``scrapy.Request``
        unchanged.

        :raises KeyError: if ``spider.crawlDate`` is set and ``url`` has no
            (non-empty) entry in the JSON index.

        The index file is opened without an existence check, so a missing
        file surfaces as the error raised by ``open``.
        """
        if spider.crawlDate:
            directory = os.path.join(
                STATIC_DIR, str(spider.crawlDate), spider.name)
            jsonPath = os.path.join(directory, "%s.json" % spider.name)
            with open(jsonPath, 'r') as f:
                urlDict = json.load(f)

            fileName = urlDict.get(url)
            if not fileName:
                raise KeyError(
                    "No date record for %s: %s" % (url, spider.crawlDate))

            # Use forward slashes in the URL even when the path was built
            # with backslash separators.
            filePath = os.path.join(directory, fileName)
            url = "http://127.0.0.1:%s/%s" % (
                spider.crawlPort, filePath.replace("\\", "/"))

        super(Request, self).__init__(
            url, callback=callback, method=method, headers=headers,
            body=body, cookies=cookies, meta=meta, encoding=encoding,
            priority=priority, dont_filter=dont_filter, errback=errback,
            flags=flags)

class BaseSpider(scrapy.Spider):
    """Base spider with a ``crawlDate`` / ``crawlPort`` configuration.

    ``crawlDate`` is ``None`` unless a date string is supplied; when it is
    supplied, the directory ``<STATIC_DIR>/<crawlDate>/<name>`` must exist.
    """

    # Both values are evaluated once, when the class body is executed; they
    # are not refreshed per instance.
    NOW = datetime.datetime.now()
    TODAY = datetime.date.today()

    def __init__(self, name=None, crawlDate=None, crawlPort=8000, **kwargs):
        """Initialise the spider.

        :param name: spider name, forwarded to ``scrapy.Spider``.
        :param crawlDate: optional date string in ``%Y-%m-%d`` format.
        :param crawlPort: port stored on the instance as ``crawlPort``.
        :param kwargs: forwarded to ``scrapy.Spider``.
        :raises ValueError: if ``crawlDate`` does not match ``%Y-%m-%d``
            (raised by ``strptime``).
        :raises KeyError: if ``crawlDate`` is given but its directory does
            not exist.
        """
        if crawlDate:
            # Work on an instance-level copy: appending to the inherited
            # list would mutate the class attribute shared by every
            # instance and grow it on each instantiation.  A spider that
            # declares no allowed_domains is left without one.
            domains = getattr(self, "allowed_domains", None)
            if domains is not None:
                domains = list(domains)
                if "127.0.0.1" not in domains:
                    domains.append("127.0.0.1")
                self.allowed_domains = domains
            self.crawlDate = \
                datetime.datetime.strptime(crawlDate, "%Y-%m-%d").date()
        else:
            self.crawlDate = None
        self.crawlPort = crawlPort

        super(BaseSpider, self).__init__(name=name, **kwargs)

        # Checked after the parent initialiser, once the name has been
        # passed to it.
        if crawlDate:
            directory = os.path.join(
                STATIC_DIR, str(self.crawlDate), self.name)
            if not os.path.isdir(directory):
                raise KeyError("No date record: %s" % self.crawlDate)
        self.logger.info("----- Start %s %s -----", self.name, self.NOW)

    @property
    def logger(self):
        """Return a ``LoggerAdapter`` around the logger named after the spider.

        The underlying logger is set to DEBUG and gets a formatted stream
        handler plus a ``<name>.log`` file handler at WARNING level.  The
        handlers are attached only the first time a given logger is seen:
        ``logging.getLogger`` returns the same object for the same name, so
        adding them on every access would duplicate each record and open a
        new file handle each time.
        """
        logger = logging.getLogger(self.name)
        if not getattr(logger, "_baseSpiderConfigured", False):
            logger.setLevel(logging.DEBUG)

            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter(
                "%(asctime)s - %(levelname)s - %(message)s"))
            logger.addHandler(handler)

            # No formatter is set on the file handler, as in the original.
            fileHandler = logging.FileHandler("%s.log" % self.name)
            fileHandler.setLevel(logging.WARNING)
            logger.addHandler(fileHandler)

            logger._baseSpiderConfigured = True
        return logging.LoggerAdapter(logger, {'spider': self})