Python Web Scraping in Detail: Fetching, Parsing, and Storing Data

 

1. Fetching Data

import requests

def drg(url):
    try:
        head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/91.0.4472.164 Safari/537.36'}
        r = requests.get(url, headers=head)
        r.raise_for_status()  # raise HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return "An exception occurred"
url = "https://www.ip138.com/mobile.asp?mobile=13018305773&action=mobile"
print(drg(url))
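
As a small variation, requests can assemble the query string itself through its params argument, so the phone number and action don't have to be spliced into the URL by hand. A minimal sketch using the same example values as above:

import requests

head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/91.0.4472.164 Safari/537.36'}
params = {"mobile": "13018305773", "action": "mobile"}  # same example values as the URL above
r = requests.get("https://www.ip138.com/mobile.asp", headers=head, params=params, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
print(r.text[:200])  # print the first 200 characters as a quick check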

 

2. Parsing Data

import requests

def login():
    try:
        # login URL (leads to the page shown after logging in)
        urllogin = "http://www.cqooc.com/user/login?username=12608199000635&password=48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69&nonce=6BA36BBB1F623279&cnonce=8257070573EFE28F"
        s = requests.session()
        r = s.post(urllogin, data=Form, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return s
    except Exception as error:
        print(error)

def get_html(s, url):
    try:
        r = s.get(url, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

if __name__ == "__main__":
    # User-Agent used for the pages after login
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36",
    }
    # replace these values with your own account details
    Form = {
        "username": "12608199000635",
        "password": "48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69",
        "nonce": "6BA36BBB1F623279",
        "cnonce": "8257070573EFE28F"
    }
    lin = login()
    # URL of the personal center page
    url = "http://www.cqooc.com/my/learn"
    html = get_html(lin, url)
    print(html)
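
Note that the code above only logs in and returns the raw HTML of the personal center page; the actual extraction step is not shown. A minimal parsing sketch with lxml, assuming html is the string returned by get_html() (the class name 'course-name' below is a hypothetical placeholder, so adjust the XPath to the real page structure):

from lxml import etree

doc = etree.HTML(html)                                  # parse the logged-in page
print(doc.xpath("//title/text()"))                      # page title, as a sanity check
print(doc.xpath("//*[@class='course-name']/text()"))   # hypothetical course names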

 

3. Saving Data to CSV and to a Database

Saving to CSV

import requests
from lxml import etree
import csv

# fetch the page
def get_html(url, time=30):
    try:
        r = requests.get(url, timeout=time)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

def parser(html):  # parsing function
    doc = etree.HTML(html)  # parse the HTML string into an lxml element tree
    out_list = []  # list holding the parsed rows
    # locate each book entry first, then query inside it (two-step lookup)
    for row in doc.xpath("//*[@class='book-img-text']//li/*[@class='book-mid-info']"):
        row_data = [
            row.xpath("h4/a/text()")[0],  # book title
            row.xpath("p[@class='author']/a/text()")[0],  # author
            row.xpath("p[2]/text()")[0].strip(),  # description
            row.xpath("p[@class='update']/span/text()")[0]  # last update date
        ]
        out_list.append(row_data)  # append each parsed row to the output list
    return out_list

def save_csv(item, path):  # write the list data to a UTF-8 file to avoid garbled text
    with open(path, "a+", newline='', encoding="utf-8") as f:  # create a UTF-8 encoded file
        csv_write = csv.writer(f)  # create the writer object
        csv_write.writerows(item)  # write multiple rows at once

if __name__ == "__main__":
    for i in range(1, 6):
        url = "https://www.qidian.com/rank/fengyun?style=1&page={0}".format(i)
        html = get_html(url)  # fetch the page
        out_list = parser(html)  # parse the page into a list of rows
        save_csv(out_list, "d:\\book.csv")  # save the data
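
One practical note: a plain UTF-8 CSV file often shows up garbled when opened directly in Excel on Windows. A common workaround is to write the file with the utf-8-sig encoding (which adds a BOM) and to give it a header row; a minimal variation of save_csv, with assumed column names:

import csv
import os

def save_csv_with_header(item, path):
    # write the header only when the file does not exist yet or is still empty
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, "a+", newline='', encoding="utf-8-sig") as f:  # utf-8-sig adds a BOM for Excel
        csv_write = csv.writer(f)
        if write_header:
            csv_write.writerow(["title", "author", "description", "updated"])  # assumed column names
        csv_write.writerows(item)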

Saving to a Database

import pymysql
import requests
from lxml import etree

def get_html(url, time=3000):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
        }
        r = requests.get(url, timeout=time, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as err:
        print(err)

result = []

def parse_html(html):
    html = etree.HTML(html)
    for row in html.xpath('//*[@id="content"]/div/div[1]/ul/li'):
        Naame = row.xpath("div[2]/h2/a/text()")[0].strip()  # book title
        score = row.xpath("div[2]/p[2]/span[2]/text()")[0].strip()  # rating
        info = row.xpath("div[2]/p[1]/text()")[0].strip().split("/")  # info line, split on "/"
        price = info[0]
        content = info[1]
        a = info[2]
        b = info[-1]
        detail = [Naame, score, price, content, a, b]
        result.append(detail)

def join_all(sql_insert, vals, **dbinfo):
    connet = None
    try:
        connet = pymysql.connect(**dbinfo)
        cursor = connet.cursor()
        cursor.executemany(sql_insert, vals)
        connet.commit()
        print("Insert succeeded!")
    except Exception as err:
        print(err)
        if connet:
            connet.rollback()
    finally:
        if connet:
            connet.close()

if __name__ == "__main__":
    parms = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "passwd": "123456",
        "db": "db",
        "charset": "utf8"
    }
    for page in range(1, 16):
        url = "https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p={0}".format(str(page))
        html = get_html(url)
        parse_html(html)
    sql_insert = "INSERT INTO db(Naame,score,price,content,a,b) \
                  VALUES(%s,%s,%s,%s,%s,%s)"
    join_all(sql_insert, result, **parms)
    print(result)
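
join_all assumes that a table named db with the columns Naame, score, price, content, a and b already exists in the db database. If it does not, a one-off script such as the following can create it (the VARCHAR column types are assumptions chosen only for illustration):

import pymysql

parms = {"host": "127.0.0.1", "port": 3306, "user": "root",
         "passwd": "123456", "db": "db", "charset": "utf8"}
create_sql = """
CREATE TABLE IF NOT EXISTS db(
    Naame   VARCHAR(255),
    score   VARCHAR(32),
    price   VARCHAR(255),
    content VARCHAR(255),
    a       VARCHAR(255),
    b       VARCHAR(255)
)
"""
conn = pymysql.connect(**parms)
with conn.cursor() as cursor:
    cursor.execute(create_sql)  # create the target table once
conn.commit()
conn.close()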

 

Summary

That's all for this article. I hope it helps, and I hope you'll keep following 编程宝库 for more content!
