python爬取猫眼影片数据

应公司要求,本次用python爬取猫眼影片数据几万条

tips:
猫眼电影请求过多就会屏蔽一次IP地址
猫眼电影列表里有重复的影片的数据

一、前期准备工作
1、相关数据表结构

-- Application-facing movie table. Presumably populated from the crawled
-- sx_maoyan staging rows by a later import step; the three crawler scripts
-- below never write to it (TODO confirm against the importer).
CREATE TABLE `sx_movie` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `movie_title` varchar(255) NOT NULL DEFAULT '' COMMENT '影片标题',
  `movie_alias` varchar(255) NOT NULL DEFAULT '' COMMENT '影片别名',
  `movie_tag` json NOT NULL COMMENT '影片标签',
  `movie_poster` varchar(255) DEFAULT '' COMMENT '影片海报',
  `movie_starring` json NOT NULL COMMENT '影片主演',
  `movie_len` int(11) unsigned NOT NULL DEFAULT '0' COMMENT '影片长度,单位分',
  `movie_intro` text NOT NULL COMMENT '影片介绍',
  `movie_score` tinyint(3) unsigned NOT NULL DEFAULT '0' COMMENT '影片得分,100分制',
  `movie_area` json NOT NULL COMMENT '影片产地国家/地区ID',
  `movie_type` json NOT NULL COMMENT '影评类型',
  `is_hot` tinyint(1) unsigned NOT NULL DEFAULT '0' COMMENT '是否热门,1=是,0=否',
  `user_num` int(10) unsigned NOT NULL DEFAULT '0' COMMENT '影评人数',
  `comment_num` int(11) unsigned NOT NULL DEFAULT '0' COMMENT '影片评论数',
  `movie_status` tinyint(1) unsigned NOT NULL DEFAULT '1' COMMENT '影片状态,1=正常,3=系统删除',
  `deleted_at` int(10) unsigned NOT NULL DEFAULT '0',
  `released_at` int(10) NOT NULL DEFAULT '0' COMMENT '大陆上映时间',
  `created_at` int(11) unsigned NOT NULL DEFAULT '0',
  `updated_at` int(11) unsigned NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COMMENT='电影列表';
-- Staging table written by the three crawler scripts below:
--   Step 1 truncates it and inserts my_id + movie_title,
--   Step 2 fills the detail-page columns (alias, poster, type, area, length,
--          release date, intro),
--   Step 3 fills movie_starring.
CREATE TABLE `sx_maoyan` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
  `my_id` bigint(20) unsigned NOT NULL DEFAULT '0' COMMENT '猫眼影片ID',
  `movie_title` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '影片标题',
  `movie_alias` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '影片别名',
  `movie_tag` json NOT NULL COMMENT '影片标签',
  `movie_poster` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '影片海报',
  `movie_len` smallint(6) NOT NULL DEFAULT '0' COMMENT '影片长度,单位:分钟',
  `movie_intro` text COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '影片介绍',
  `movie_area` json NOT NULL COMMENT '影片产地国家/地区',
  `movie_type` json NOT NULL COMMENT '影评类型',
  `movie_starring` json NOT NULL COMMENT '影片演员',
  `release_at` varchar(25) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '0' COMMENT '上映时间',
  `created_at` int(10) unsigned NOT NULL COMMENT '创建时间',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='猫眼电影';

二、爬取数据
Step 1:爬取猫眼电影ID和标题

# -*- coding: utf-8 -*-
import requests
from requests.exceptions import RequestException
import re
import json
import MySQLdb
import time
import sys


def get_one_page(url):
    """Fetch a web page and return its HTML text.

    :param url: page URL
    :return: response body on HTTP 200, otherwise None (non-200 status,
             network error, or timeout)
    """
    try:
        # Spoof a desktop browser UA; fixed the missing space that used to
        # glue "Gecko)" and "Chrome/" together into a malformed UA string.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          + ' Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        # Explicit timeout: without one a stalled connection (e.g. when
        # Maoyan throttles the IP) would hang the whole crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Anything other than 200 is treated as a failure.
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Extract (movie id, title) pairs from a Maoyan film-list page.

    Target markup::

      <div class="channel-detail movie-item-title" title="惊奇队长">
      <a href="/films/341139" target="_blank" data-act="movies-click" data-val="{movieId:341139}">惊奇队长</a>
    </div>

    :param html: raw HTML of one list page
    :return: generator of dicts {'id': <movie id str>, 'title': <title str>}
    """
    # re.S lets .*? span the newlines between the <div> and the <a>.
    pattern = re.compile(
        r'<div class="channel-detail movie-item-title" title="(.*?)">.*?movieId:(.*?)}"',
        re.S)

    # NOTE: dropped the leftover debug print(items) of the old version.
    for title, movie_id in re.findall(pattern, html):
        yield {
            'id': movie_id,
            'title': title,
        }


def main():
    """Crawl the Maoyan list pages and store (my_id, title) rows in sx_maoyan.

    Usage: ``python step1.py <mysql-root-password>`` — the password is taken
    from argv so it is not hard-coded in the source.
    """
    db = MySQLdb.connect('127.0.0.1', 'root', sys.argv[1], 'shengxi_v2')
    cursor = db.cursor()
    url = 'http://maoyan.com/films?showType=2'

    # Start from a clean slate on every run.
    cursor.execute("truncate table sx_maoyan")

    # 9 pages, 30 films per page (offset=0,30,...,240).
    for page in range(0, 9):
        weburl = url + '&offset=' + str(page * 30)
        print(weburl)
        html = get_one_page(weburl)
        # get_one_page returns None on failure (non-200, timeout, IP ban);
        # skip the page instead of crashing inside parse_one_page(None).
        if html is None:
            continue
        for item in parse_one_page(html):
            print(item)
            insert_table(item, db, cursor)


def insert_table(item, db, cursor):
    """Insert one (movie id, title) row into sx_maoyan.

    The remaining columns get empty placeholders; Steps 2 and 3 fill them
    in from the film detail page.

    :param item: dict with keys 'id' (Maoyan movie id) and 'title'
    :param db: open MySQLdb connection (committed once per row)
    :param cursor: cursor created from *db*
    """
    # Parameterized query instead of %-formatted SQL: titles routinely
    # contain quotes, which broke the old statement (and were injectable).
    sql = ("insert into sx_maoyan (my_id, movie_title, movie_alias, movie_tag, "
           "movie_poster, movie_intro, movie_area, movie_type, created_at) "
           "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    cursor.execute(sql, (str(item['id']), str(item['title']), '', json.dumps([]),
                         '', '', json.dumps([]), json.dumps([]), int(time.time())))
    db.commit()


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Step 2:获取影片详细信息

# -*- coding: utf-8 -*-
import requests
from requests.exceptions import RequestException
import re
import json
import MySQLdb
import sys


def get_one_page(url):
    """Fetch a web page and return its HTML text.

    :param url: page URL
    :return: response body on HTTP 200, otherwise None (non-200 status,
             network error, or timeout)
    """
    try:
        # Spoof a desktop browser UA; fixed the missing space that used to
        # glue "Gecko)" and "Chrome/" together into a malformed UA string.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          + ' Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        # Explicit timeout: without one a stalled connection (e.g. when
        # Maoyan throttles the IP) would hang the whole crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Anything other than 200 is treated as a failure.
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Extract one film's details from a Maoyan detail page (``/films/<id>``).

    (The old docstring was a copy-paste from the list-page parser and
    described the wrong markup.)

    Captured groups, in order: poster image URL, Chinese name, English name,
    genre line, "area / runtime" line, release-date line, synopsis.

    :param html: raw HTML of a detail page
    :return: generator of dicts with keys image/name/ename/type/area/
             length/release/desc
    """
    pattern = re.compile(
        r'<img class="avatar" src="(.*?)".*?<h3 class="name">(.*?)</h3>.*?'
        r'<div class="ename ellipsis">(.*?)</div>.*?<li class="ellipsis">(.*?)</li>.*?'
        r'<li class="ellipsis">(.*?)</li>.*?<li class="ellipsis">(.*?)</li>.*?'
        r'<span class="dra">(.*?)</span>',
        re.S)

    for item in re.findall(pattern, html):
        # 5th capture looks like "中国大陆 / 120分钟": strip all whitespace,
        # then split the area from the runtime.
        area = item[4].replace(' ', '').replace('\n', '')
        if '/' in area:
            areainfo = area.split('/')
            area = areainfo[0]
            # [0:-2] drops the trailing "分钟" ("minutes") suffix — assumes
            # Maoyan always renders it; TODO confirm.
            length = areainfo[1][0:-2]
        else:
            # No runtime on the page. NOTE(review): this branch yields int 0
            # while the other yields a string; downstream %-formatting
            # renders both identically.
            length = 0

        yield {
            'image': item[0],
            'name': item[1],
            'ename': item[2],
            'type': item[3],
            'area': area,
            'length': length,
            # Drop the trailing 4 chars "大陆上映" from e.g. "2019-03-08大陆上映".
            'release': item[5][0:-4],
            'desc': item[6],
        }


def main():
    """Fill in the detail-page columns for every row already in sx_maoyan.

    Usage: ``python step2.py <mysql-root-password>``.
    """
    db = MySQLdb.connect('127.0.0.1', 'root', sys.argv[1], 'shengxi_v2')
    cursor = db.cursor()
    cursor.execute("""select * from sx_maoyan""")

    # row[0] is the primary key `id`, row[1] the Maoyan id `my_id`.
    results = cursor.fetchall()
    for row in results:
        url = 'http://maoyan.com/films/' + str(row[1])
        print(url)
        html = get_one_page(url)
        # None means the fetch failed (non-200 / network error); skip the row.
        if html is not None:
            for item in parse_one_page(html):
                # Parameterized update instead of %-formatted SQL: synopses
                # and aliases frequently contain quotes, which broke the old
                # statement (and were injectable).
                upsql = ("update sx_maoyan set movie_alias=%s, movie_poster=%s, "
                         "`movie_type`=%s, movie_area=%s, `movie_len`=%s, "
                         "release_at=%s, `movie_intro`=%s, movie_starring=%s "
                         "where id = %s")
                cursor.execute(upsql, (
                    item['ename'], item['image'],
                    json.dumps(item['type'].split(','), ensure_ascii=False),
                    json.dumps(item['area'].split(','), ensure_ascii=False),
                    item['length'], item['release'], item['desc'],
                    json.dumps([], ensure_ascii=False), row[0]))
                db.commit()


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Step 3:获取影片主演信息

# -*- coding: utf-8 -*-
import requests
from requests.exceptions import RequestException
import re
import json
import sys
import MySQLdb


def get_one_page(url):
    """Fetch a web page and return its HTML text.

    :param url: page URL
    :return: response body on HTTP 200, otherwise None (non-200 status,
             network error, or timeout)
    """
    try:
        # Spoof a desktop browser UA; fixed the missing space that used to
        # glue "Gecko)" and "Chrome/" together into a malformed UA string.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          + ' Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        # Explicit timeout: without one a stalled connection (e.g. when
        # Maoyan throttles the IP) would hang the whole crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Anything other than 200 is treated as a failure.
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Extract the first four actor names from a detail page's cast section.

    (The old docstring was a copy-paste from the list-page parser and
    described the wrong markup.)

    :param html: raw HTML of a detail page
    :return: generator yielding a list of four whitespace-cleaned actor names
    """
    pattern = re.compile(
        r'<div class="celebrity-type">.*演员.*?<span class="num">.*?<div class="info">.*?'
        r'class="name">(.*?)</a>.*?div class="info">.*?class="name">(.*?)</a>.*?'
        r'div class="info">.*?class="name">(.*?)</a>.*?div class="info">.*?class="name">(.*?)</a>',
        re.S)

    for item in re.findall(pattern, html):
        # BUGFIX: the old code yielded a *set literal* here, which silently
        # deduplicated names and randomized the order that ended up stored
        # in movie_starring. Yield a list to preserve the page order.
        yield [name.replace(' ', '').replace('\n', '') for name in item]


def main():
    """Populate movie_starring for every row in sx_maoyan.

    Usage: ``python step3.py <mysql-root-password>``.
    """
    db = MySQLdb.connect('127.0.0.1', 'root', sys.argv[1], 'shengxi_v2')

    cursor = db.cursor()
    cursor.execute("""select * from sx_maoyan order by id asc""")

    results = cursor.fetchall()
    # row[0] is the primary key `id`, row[1] the Maoyan id `my_id`.
    for row in results:
        # Reset to an empty JSON list first so rows whose detail page cannot
        # be fetched or parsed still hold valid JSON in movie_starring.
        cursor.execute("update sx_maoyan set movie_starring = %s where id = %s",
                       (json.dumps([]), row[0]))
        db.commit()

        url = 'http://maoyan.com/films/' + str(row[1])
        print(url)

        html = get_one_page(url)
        # BUGFIX: get_one_page returns None on failure (e.g. when Maoyan
        # throttles the IP); the old code crashed inside parse_one_page(None).
        if html is None:
            continue
        for item in parse_one_page(html):
            print(item)
            # Parameterized update: actor names may contain quotes, which
            # broke the old %-formatted SQL (and were injectable).
            cursor.execute("update sx_maoyan set movie_starring = %s where id = %s",
                           (json.dumps(list(item), ensure_ascii=False), row[0]))
            db.commit()


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Leave Comment