Scrapy Download Helper FTP for Directory Listing Support
Scrapy's default FTP download helper is perfect for downloading files over FTP. Sometimes, however, an FTP directory listing is required, and the default FTP download helper doesn't support that at all.
If you need directory listings, give this a try and let me know whether it works. If you have a better solution or any suggestions for improvement, please share them.
Let's create a file project_root/project/ftp.py
with the following content:
# -*- coding: utf-8 -*-
import re
from io import BytesIO
from six.moves.urllib.parse import urlparse, unquote
from twisted.internet import reactor
from twisted.protocols.ftp import FTPClient, CommandFailed, FTPFileListProtocol
from twisted.internet.protocol import Protocol, ClientCreator
from scrapy.http import Response
from scrapy.responsetypes import responsetypes
from scrapy.core.downloader.handlers.ftp import ReceivedDataProtocol
class ReceivedData(object):
    """In-memory sink that mimics ReceivedDataProtocol's interface.

    Used to hold a rendered directory listing (built from LIST output)
    instead of an actual downloaded file, so _build_response can treat
    both cases uniformly.
    """
    def __init__(self):
        self.body = BytesIO()   # accumulated listing bytes
        self.is_file = False    # a listing is never backed by a real file
        self.size = 0           # running total of bytes received
    def dataReceived(self, data):
        """Append a chunk of bytes and update the running size."""
        self.size = self.size + len(data)
        self.body.write(data)
    @property
    def filename(self):
        # A directory listing has no local filename on disk.
        return None
    def close(self):
        """Rewind the buffer so the body can be read from the start."""
        self.body.seek(0)
_CODE_RE = re.compile("\d+")
class FTPDownloadHandler(object):
    """Scrapy FTP download handler that falls back to a directory
    listing (LIST) when plain file retrieval (RETR) fails.

    Regular files are returned as usual.  When the requested path is a
    directory, the response body contains one line per entry in the
    form ``filetype::filename::size::nlinks::date`` and the response
    header ``filetype`` is set to ``"d"``.

    Credentials come from ``request.meta``: ``ftp_user``,
    ``ftp_password`` and the optional ``ftp_passive`` flag.
    """
    # Map FTP reply codes to HTTP status codes for the final Response.
    CODE_MAPPING = {
        "550": 404,      # 550: requested action not taken / file unavailable
        "default": 503,
    }
    def __init__(self, setting):
        # `setting` is accepted to satisfy Scrapy's handler construction
        # API; no options are currently read from it.  NOTE: no per-request
        # state is kept on the instance -- a single handler serves all FTP
        # requests of the crawl, so state is threaded through callback args
        # instead (the original stored client/filepath/dir_listing on self,
        # which concurrent requests would clobber).
        pass
    def download_request(self, request, spider):
        """Entry point called by Scrapy's downloader.

        Returns a Deferred that fires with a Response.
        """
        parsed_url = urlparse(request.url)
        creator = ClientCreator(reactor, FTPClient, request.meta["ftp_user"],
                                request.meta["ftp_password"],
                                passive=request.meta.get("ftp_passive", 0))
        return creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)\
            .addCallback(self.gotClient, request, unquote(parsed_url.path))
    def gotClient(self, client, request, filepath):
        # Try a plain file download first; on failure, fall back to a
        # directory listing in the errback.
        protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename", None))
        return client.retrieveFile(filepath, protocol)\
            .addCallbacks(callback=self._build_response,
                          callbackArgs=(request, protocol, False),
                          errback=self._file_failed,
                          errbackArgs=(client, request, filepath))
    def _build_response(self, result, request, protocol, is_dir_listing=False):
        """Turn a finished transfer (file or listing) into a Response."""
        respcls = responsetypes.from_args(url=request.url)
        protocol.close()
        # When the download was saved to a local file, the body carries
        # the local filename; otherwise it carries the transferred bytes.
        body = protocol.filename or protocol.body.read()
        headers = {"local filename": protocol.filename or '',
                   "size": protocol.size,
                   "filetype": ("d" if is_dir_listing else "")}
        return respcls(url=request.url, status=200, body=body, headers=headers)
    def _file_failed(self, result, client, request, filepath):
        # RETR failed -- the path is probably a directory, so try LIST.
        fileList = FTPFileListProtocol()
        return client.list(filepath or ".", fileList)\
            .addCallbacks(callback=self._prepare_data,
                          callbackArgs=(request, fileList),
                          errback=self._failed,
                          errbackArgs=(request,))
    def _prepare_data(self, result, request, fileList):
        # Serialize the parsed LIST entries into a line-per-entry body.
        protocol = ReceivedData()
        for file in fileList.files:
            protocol.dataReceived('{0}::{1}::{2}::{3}::{4}\n'.format(
                file['filetype'], file['filename'], file['size'],
                file['nlinks'], file['date']).encode("utf-8"))
        protocol.close()
        return self._build_response(result, request, protocol, True)
    def _failed(self, result, request):
        # Both RETR and LIST failed: map the FTP error code to an HTTP
        # status when possible, otherwise re-raise the original failure.
        message = result.getErrorMessage()
        if result.type == CommandFailed:
            m = _CODE_RE.search(message)
            if m:
                httpcode = self.CODE_MAPPING.get(m.group(),
                                                 self.CODE_MAPPING["default"])
                # Response bodies must be bytes, not str.
                return Response(url=request.url, status=httpcode,
                                body=message.encode("utf-8"))
        raise result.type(result.value)
Then edit project_root/project/settings.py
and add the following lines:
# Register the custom FTP handler.  Override DOWNLOAD_HANDLERS -- which
# Scrapy merges with DOWNLOAD_HANDLERS_BASE -- rather than redefining
# DOWNLOAD_HANDLERS_BASE itself: the Scrapy docs state that the *_BASE
# settings should never be modified in a project, and redefining the whole
# base mapping would silently drop any handlers Scrapy adds to it in
# future versions.  The default file/http/https/s3 handlers remain active.
DOWNLOAD_HANDLERS = {
    'ftp': 'project.ftp.FTPDownloadHandler',
}
That's all.