多应用+插件架构,代码干净,二开方便,首家独创一键云编译技术,文档视频完善,免费商用码云13.8K 广告
我们可以在 `settings.py` 中做一些配置参数,更多配置参考官网:https://scrapy-chs.readthedocs.io/zh_CN/1.0/topics/settings.html 。

```python
############ settings.py 常用参数参考 ################

# 控制日志级别 (ERROR WARNING INFO DEBUG)
LOG_LEVEL = 'DEBUG'
# 将日志写到log文件中
LOG_FILE = 'scrapy.log'
# 自定义超时时间
DOWNLOAD_TIMEOUT = 15
# 不遵守 robots 协议
ROBOTSTXT_OBEY = False
# 不需要cookie时,尽量禁用它
COOKIES_ENABLED = False
# 打印cookie到控制台上
COOKIES_DEBUG = True

# 配置 telnet 控制台,功能强大
TELNETCONSOLE_USERNAME = 'xxxxxx'
TELNETCONSOLE_PASSWORD = 'xxxxxx'
TELNETCONSOLE_PORT = 6666
TELNETCONSOLE_HOST = '0.0.0.0'

# 自定义请求头
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
# 自定义UA,大规模爬虫请去中间件里面设置
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'

# =======================================================================
# 启动‘自动掐死’(AutoThrottle)插件后,一般不用管这个了
# DOWNLOAD_DELAY = 1
# CONCURRENT_REQUESTS = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# =======================================================================
# 自动流量控制
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 0
AUTOTHROTTLE_MAX_DELAY = 30
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_DEBUG = False

# =======================================================================
# http缓存,把请求和响应缓存到本地,很适合调试时候使用。
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```