Press "Enter" to skip to content

Python 学习之 requests + pyquery 实现图片抓取

requests

一个为人类设计的 HTTP 请求库

pyquery

一个以 jQuery 选择器语法为基础的 HTML 解析与查询库

#!/usr/bin/env python3
# _*_ coding: utf-8 _*_

import logging
import os
from urllib.parse import urljoin, urlparse

import requests
from pyquery import PyQuery


class Spider(object):
    """Download every ``<img>`` referenced by a web page into ``./image``.

    Usage: ``Spider().handle(page_url)``. Files are named ``image0``,
    ``image1``, ... in download order, keeping the suffix found in the
    image URL's path.
    """

    def __parse(self, doc):
        """Wrap the HTML string *doc* in a PyQuery object for selector queries."""
        return PyQuery(doc)

    def __request(self, url):
        """Send a GET request to *url* and return the ``requests`` response.

        A timeout is set so that an unresponsive server cannot block the
        crawl forever.
        """
        return requests.get(url, timeout=10)

    def __save(self, filename, content):
        """Write the bytes *content* to ``./image/<filename>``.

        The target folder is created on demand; an existing file with the
        same name is overwritten.
        """
        folder = './image'
        os.makedirs(folder, exist_ok=True)

        # Opening in 'wb' mode creates the file itself, so no placeholder
        # file has to be created beforehand.
        with open(os.path.join(folder, filename), 'wb') as file:
            file.write(content)

    def __correct_url(self, url, base=None):
        """Return an absolute URL for the ``src`` value *url*.

        Args:
            url: Raw ``src`` attribute value.
            base: URL of the page the image was found on. When given,
                relative and scheme-relative references are resolved
                against it.

        Returns:
            *url* unchanged if it already is an http(s) URL; otherwise the
            reference resolved against *base*, or, without a base, the
            reference with leading slashes removed and ``http://`` prepended.
        """
        url = url.strip()
        if url.startswith(('http://', 'https://')):
            return url
        if base:
            return urljoin(base, url)

        # No page URL available: treat the value as "//host/path" or
        # "host/path" and only drop the leading slashes.
        return 'http://' + url.lstrip('/')

    def __filename(self, counter, url):
        """Build the local file name ``image<counter><suffix>`` for *url*.

        The suffix is taken from the URL's path only, so query strings and
        fragments never leak into the file name. It is empty when the path
        has no extension.
        """
        suffix = os.path.splitext(urlparse(url).path)[1]
        return 'image' + str(counter) + suffix

    def handle(self, url):
        """Fetch the page at *url* and download all images it references.

        Prints ``<filename>=> ok`` for each saved image. Failures are logged
        rather than raised; a failure on one image does not stop the
        remaining downloads.
        """
        try:
            response = self.__request(url)
            if response.status_code != 200:
                return

            html = self.__parse(response.text)
            counter = 0

            for img in html('img').items():
                src = img.attr('src')
                # Skip tags without a source and inline data URIs; there is
                # nothing to download for them.
                if not src or src.strip().startswith('data:'):
                    continue

                try:
                    # Resolve against the final page URL (after redirects).
                    image_url = self.__correct_url(src, response.url)
                    resource = self.__request(image_url)  # fetch the image
                    if resource.status_code != 200:
                        continue

                    filename = self.__filename(counter, image_url)
                    self.__save(filename, resource.content)
                except (requests.RequestException, OSError):
                    logging.exception('failed to download image %s', src)
                    continue

                print(filename + '=> ok')
                counter += 1

        except Exception:
            # Top-level boundary: report and return instead of crashing,
            # while still letting KeyboardInterrupt/SystemExit propagate.
            logging.exception('failed to process page %s', url)


# Example run: download every image found on the page below.
target_page = 'https://topic.autohome.com.cn/new/marketing/2018/12/izoa/'
spider = Spider()
spider.handle(target_page)

Be First to Comment

发表评论

电子邮件地址不会被公开。 必填项已用*标注