By东关

Python 爬取京东商品代码(图片、价格等信息)
2021-05-25

# -*- coding:utf-8 -*-
import requests
import re
import random
import time
import json
import pymysql
import os
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import urllib
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  ###禁止提醒SSL警告


class jd(object):
    """Scrape the product list of one JD.com shop into MySQL.

    For every product returned by the mobile search endpoint the crawler
    downloads the cover image to local disk, fetches the mobile detail page
    to extract the image gallery, and inserts one row into ``tp_items_jd``.
    """

    # Seconds to wait for any single HTTP request made through ``requests``.
    REQUEST_TIMEOUT = 10

    # Desktop user agent used when fetching a product detail page.
    DETAIL_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }

    def __init__(self):
        """Create the shared HTTP session with mobile-browser headers."""
        self.s = requests.session()
        headers = {
            'accept':'application/json, text/javascript, */*; q=0.01',
            'accept-encoding':'gzip, deflate, br',
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
                   }
        self.s.headers.update(headers)

    @staticmethod
    def _parse_shop_id(url):
        """Return the shop id embedded in a ``.../index-<id>.html`` shop URL.

        Raises:
            ValueError: if *url* does not contain an ``index-<id>.html`` part.
        """
        match = re.search(r'index-(.*?)\.html', url)
        if match is None:
            raise ValueError('cannot find a shop id in url: {!r}'.format(url))
        return match.group(1)

    @staticmethod
    def _parse_search_response(text):
        """Return the ``Paragraph`` product list from a JSONP search response.

        The first 10 characters (the ``jsonpCBKA(`` callback prefix requested
        in the search URL) and the last 2 characters are stripped, and every
        backslash is doubled before the payload is parsed as JSON.
        """
        payload = text[10:-2]
        # Presumably the endpoint emits backslash sequences that are not valid
        # JSON escapes; doubling them lets json.loads accept the payload.
        data = json.loads(payload.replace('\\', '\\\\'))
        return data.get('data').get('searchm').get('Paragraph')

    @staticmethod
    def _extract_images(html):
        """Return the detail page's ``"image"`` array as a JSON-array string.

        Returns ``"[]"`` when the page has no ``"image":[...]`` fragment or
        the array is empty, so a product without a gallery does not abort
        the crawl.
        """
        match = re.search(r'"image":\[(.*?)\]', html)
        if match and match.group(1):
            return "[" + match.group(1) + "]"
        return "[]"

    def getdata(self,url,name):
        """Crawl every product page of the shop at *url* and store the rows.

        Args:
            url: Shop home page URL of the form ``.../index-<shopid>.html``.
            name: Accepted for backward compatibility; not used.

        Raises:
            ValueError: if no shop id can be extracted from *url*.

        Crawling stops at the first empty page, or at the first page whose
        processing fails (the raw response is then dumped to ``content.log``).
        """
        self.shopid = self._parse_shop_id(url)
        # Visit the shop's mobile search page first with the shared session.
        self.s.get('https://shop.m.jd.com/search/search?shopId='+str(self.shopid),
                   timeout=self.REQUEST_TIMEOUT)

        # NOTE(review): paging starts at 2, so page 1 is never requested --
        # confirm this is intentional.
        for page in range(2,10000):
            time.sleep(1+random.random())  # random delay of 1-2 seconds
            t = int(time.time() * 1000)
            searchurl = 'https://wqsou.jd.com/search/searchjson?datatype=1&page={}&pagesize=40&merge_sku=yes&qp_disable=yes&key=ids%2C%2C{}&_={}&sceneval=2&g_login_type=1&callback=jsonpCBKA&g_ty=ls'.format(page,self.shopid,t)
            print(searchurl)
            raw = self.s.get(url=searchurl, verify=False,
                             timeout=self.REQUEST_TIMEOUT).text

            try:
                items = self._parse_search_response(raw)
                if not items:
                    # No more products: the crawl is finished.
                    return
                self._save_items(items)
            except Exception as e:
                # Best effort: report, keep the offending response for
                # inspection, and stop crawling.
                print(e.args)
                with open('content.log', 'wb') as file:
                    file.write(str.encode(raw))
                return

    def _save_items(self, items):
        """Insert every product of one result page, then close the connection."""
        db = pymysql.connect(host="127.0.0.1",
                             user="jd",
                             password="123456",
                             port=3306,
                             database="jd",
                             charset='utf8')
        try:
            cursor = db.cursor()
            try:
                for item in items:
                    self._save_item(db, cursor, item)
            finally:
                cursor.close()
        finally:
            db.close()

    def _save_item(self, db, cursor, item):
        """Download one product's assets and insert its row."""
        title = item.get('Content').get('warename')
        price = item.get('dredisprice')
        icon = self.getimg(item.get('Content').get('imageurl'))
        ware_id = item.get('wareid')
        jd_url = "https://item.jd.com/"+ware_id+".html"
        jd_m_url = "https://item.m.jd.com/product/"+ware_id+".html"

        # requests.get opens and closes its own session, so nothing leaks.
        r = requests.get(jd_m_url, headers=self.DETAIL_HEADERS,
                         timeout=self.REQUEST_TIMEOUT)
        with open(r"JdContent.txt", 'a+', encoding='UTF-8') as file:
            file.write(r.text + ' ')

        images = self._extract_images(r.text)

        # Placeholders let the driver escape the values, so quotes in a
        # product name can neither break nor inject into the statement.
        sql = ("INSERT INTO tp_items_jd(name, price, icon, icon2, url, images, jd_id) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s)")
        params = (title, price, icon, icon, jd_url, images, ware_id)
        print(sql, params)
        try:
            cursor.execute(sql, params)
            db.commit()
        except Exception as e:
            # Roll back the failed insert and continue with the next product.
            print(e.args)
            db.rollback()

    def getimg(self, imageurl):
        """Download a product image and return its site-relative path.

        Args:
            imageurl: Image path as returned by the search endpoint; it is
                appended to the ``img10.360buyimg.com`` 590x590 URL prefix.

        Returns:
            ``/img/jdimg/<YYYYmmddHHMM>/<number>.jpg`` on success, or an
            empty string if the download or the write failed.
        """
        # Imported here so the submodule is loaded explicitly instead of
        # relying on another package having imported it already.
        import urllib.request

        saved_path = ''
        day = time.strftime("%Y%m%d%H%M", time.localtime(int(time.time())))
        try:
            with urllib.request.urlopen('https://img10.360buyimg.com/n7/s590x590_' + imageurl,
                                        timeout=5) as response:
                data = response.read()

            directory = '/data/upload/img/jdimg/'+day+'/'
            os.makedirs(directory, exist_ok=True)

            # Re-roll the random name until it is free so that two images
            # saved in the same minute never overwrite each other.
            while True:
                ran = random.randint(10000,99999)
                filename = directory+str(ran)+'.jpg'
                if not os.path.exists(filename):
                    break

            with open(filename, 'wb') as file:
                file.write(data)
            saved_path = '/img/jdimg/' + day + '/'+str(ran)+'.jpg'
        except Exception as e:
            # Best effort: a missing image must not stop the crawl.
            print(str(e))
            print('')

        return saved_path

if __name__ == '__main__':
    # Shop home pages to crawl; each URL must contain "index-<shopid>.html".
    urls = [
        "https://mall.jd.com/index-1000000140.html?from=pc",  # shop
    ]
    for url in urls:
        nm='intel'
        # A fresh crawler (and HTTP session) is created for every shop.
        j = jd()
        j.getdata(url,nm)

你必须 登录 才能发表评论.

  • 还没有人留下脚印噢,快来踩踩叭