All posts All posts by this author Paper color Change page color Announcements

Scrapy FTP Download Handler with Directory Listing Support

Scrapy's default FTP download handler works well for downloading files over FTP. Sometimes, however, an FTP directory listing is required, and the default FTP download handler does not support that at all.

If you need directory listings, give the handler below a try and let me know whether it works for you. If you have a better solution or any suggestions for improvement, please share them.

Let's create the file project_root/project/ftp.py with the following content:

# -*- coding: utf-8 -*-

import re
from io import BytesIO
from six.moves.urllib.parse import urlparse, unquote

from twisted.internet import reactor
from twisted.protocols.ftp import FTPClient, CommandFailed, FTPFileListProtocol
from twisted.internet.protocol import Protocol, ClientCreator

from scrapy.http import Response
from scrapy.responsetypes import responsetypes


from scrapy.core.downloader.handlers.ftp import ReceivedDataProtocol


class ReceivedData(object):
    """In-memory byte buffer used to hold a generated directory listing.

    It offers the members that ``FTPDownloadHandler._build_response`` reads
    from a protocol object: ``body``, ``size``, ``filename`` and ``close()``.
    """

    def __init__(self):
        self.size = 0          # running total of bytes appended so far
        self.is_file = False
        self.body = BytesIO()  # accumulated payload

    @property
    def filename(self):
        """Always ``None``: the data is kept in memory, not in a local file."""
        return None

    def dataReceived(self, data):
        """Append the bytes in *data* to ``body`` and add their length to ``size``."""
        chunk_length = len(data)
        self.body.write(data)
        self.size = self.size + chunk_length

    def close(self):
        """Rewind ``body`` to its start so the content can be read back."""
        self.body.seek(0)



# First run of digits in an FTP error message; used to pick out the reply code.
# Raw string: "\d" in a plain literal is an invalid escape sequence.
_CODE_RE = re.compile(r"\d+")
class FTPDownloadHandler(object):
    """Download handler for FTP URLs that falls back to a directory listing.

    The URL path is first retrieved as a file. If that retrieval fails, the
    same path is listed as a directory and the listing becomes the response
    body, one ``filetype::filename::size::nlinks::date`` line per entry, with
    the ``filetype`` response header set to ``d``.

    Request meta keys read: ``ftp_user`` and ``ftp_password`` (required),
    ``ftp_passive`` (defaults to ``0``) and ``ftp_local_filename`` (optional).
    """

    # FTP reply code -> HTTP status used for the error Response.
    CODE_MAPPING = {
        "550": 404,
        "default": 503,
    }

    def __init__(self, setting):
        # Retained only so code that reads this attribute keeps working.
        # One handler instance serves every request, so per-request state is
        # carried in callback arguments instead of being stored here.
        self.dir_listing = False

    @staticmethod
    def _as_bytes(value):
        """Return *value* unchanged if it is bytes, else UTF-8 encode it."""
        if isinstance(value, bytes):
            return value
        return value.encode("utf-8")

    @staticmethod
    def _format_listing(files):
        """Serialize listing entries to UTF-8 bytes, one line per entry.

        *files* is an iterable of mappings with the keys ``filetype``,
        ``filename``, ``size``, ``nlinks`` and ``date``.
        """
        lines = [
            '{0}::{1}::{2}::{3}::{4}\n'.format(
                entry['filetype'], entry['filename'], entry['size'],
                entry['nlinks'], entry['date'])
            for entry in files
        ]
        return ''.join(lines).encode("utf-8")

    def download_request(self, request, spider):
        """Connect to the FTP server of ``request.url`` and start the download.

        Returns the Deferred produced by the connection attempt, with
        ``gotClient`` chained as its callback.
        """
        parsed_url = urlparse(request.url)
        creator = ClientCreator(
            reactor, FTPClient,
            request.meta["ftp_user"],
            request.meta["ftp_password"],
            passive=request.meta.get("ftp_passive", 0),
        )
        connecting = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
        return connecting.addCallback(self.gotClient, request, unquote(parsed_url.path))

    def gotClient(self, client, request, filepath):
        """Try to retrieve *filepath* as a file; list it as a directory on failure."""
        # Last-seen values, kept for backward compatibility. The callbacks
        # below receive client/filepath explicitly, so overlapping requests
        # cannot overwrite each other's state.
        self.client = client
        self.filepath = filepath
        protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename", None))
        return client.retrieveFile(filepath, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._file_failed,
            errbackArgs=(request, client, filepath),
        )

    def _build_response(self, result, request, protocol, dir_listing=False):
        """Build the 200 response from the data collected by *protocol*.

        *dir_listing* marks the body as a directory listing; it sets the
        ``filetype`` header to ``d`` (empty string otherwise).
        """
        self.result = result
        respcls = responsetypes.from_args(url=request.url)
        protocol.close()
        # Encode text so the body is bytes on Python 3 as well.
        body = self._as_bytes(protocol.filename or protocol.body.read())
        headers = {
            "local filename": protocol.filename or '',
            "size": protocol.size,
            "filetype": "d" if dir_listing else "",
        }
        return respcls(url=request.url, status=200, body=body, headers=headers)

    def _file_failed(self, result, request, client=None, filepath=None):
        """Errback for the file retrieval: list the path as a directory instead.

        *client* and *filepath* default to the values last stored by
        ``gotClient`` so that two-argument callers keep working.
        """
        if client is None:
            client = self.client
        if filepath is None:
            filepath = self.filepath
        file_list = FTPFileListProtocol()
        # An empty path means the server root; list "." in that case.
        return client.list(filepath or ".", file_list).addCallbacks(
            callback=self._prepare_data,
            callbackArgs=(request, file_list),
            errback=self._failed,
            errbackArgs=(request,),
        )

    def _prepare_data(self, result, request, fileList):
        """Turn the collected listing in ``fileList.files`` into a response."""
        protocol = ReceivedData()
        protocol.dataReceived(self._format_listing(fileList.files))
        return self._build_response(result, request, protocol, dir_listing=True)

    def _failed(self, result, request):
        """Errback for the listing: map an FTP error code to an HTTP status.

        Returns a ``Response`` when the failure is a ``CommandFailed`` whose
        message contains digits; otherwise returns the failure unchanged so
        it continues down the errback chain.
        """
        message = result.getErrorMessage()
        if result.type == CommandFailed:
            m = _CODE_RE.search(message)
            if m:
                ftpcode = m.group()
                httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
                return Response(url=request.url, status=httpcode,
                                body=self._as_bytes(message))
        # Propagate the original failure instead of re-wrapping its value in
        # a freshly constructed exception.
        return result

Edit project_root/project/settings.py and add the following lines:

# Override only the FTP handler. Scrapy merges DOWNLOAD_HANDLERS over its
# built-in DOWNLOAD_HANDLERS_BASE, so the default file/http/https/s3 handlers
# stay in place without being copied here.
DOWNLOAD_HANDLERS = {
    'ftp': 'project.ftp.FTPDownloadHandler',
}

That's all.