A Detailed Look at an HTML Format-Cleaning Tool Built on XPath Selectors, PyQuery, and Regular Expressions

1. Use XPath to remove unnecessary tag elements, as well as tags with no content

import re

from loguru import logger
from lxml import etree

def xpath_clean(self, text: str, xpath_dict: dict) -> str:
  '''
  Remove unwanted elements via XPath
  :param text: html_content
  :param xpath_dict: XPath expressions for the elements to remove
  :return: html_content as a string
  '''
  remove_by_xpath = xpath_dict if xpath_dict else dict()

  # Tags that should almost always be removed; barring unusual cases, all of these go
  remove_by_xpath.update({
    '_remove_2': '//iframe',
    '_remove_4': '//button',
    '_remove_5': '//form',
    '_remove_6': '//input',
    '_remove_7': '//select',
    '_remove_8': '//option',
    '_remove_9': '//textarea',
    '_remove_10': '//figure',
    '_remove_11': '//figcaption',
    '_remove_12': '//frame',
    '_remove_13': '//video',
    '_remove_14': '//script',
    '_remove_15': '//style'
  })

  parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
  selector = etree.HTML(text, parser=parser)

  # Standard removal pass: delete the unwanted tags
  for xpath in remove_by_xpath.values():
    for bad in selector.xpath(xpath):
      bad_string = etree.tostring(bad, encoding='utf-8',
                    pretty_print=True).decode()
      logger.debug(f"clean article content : {bad_string}")
      bad.getparent().remove(bad)

  skip_tip = "name()='img' or name()='tr' or " \
        "name()='th' or name()='tbody' or " \
        "name()='thead' or name()='table'"
  # Check every remaining element for content; delete any that are empty
  for p in selector.xpath(f"//*[not({skip_tip})]"):
    # Skip logic: keep nodes that contain table or image descendants, or any real text
    if p.xpath(f".//*[{skip_tip}]") or \
        bool(re.sub(r'\s', '', p.xpath('string(.)'))):
      continue

    bad_p = etree.tostring(p, encoding='utf-8',
                pretty_print=True).decode()
    logger.debug(f"clean p tag : {bad_p}")
    p.getparent().remove(p)

  return etree.tostring(selector, encoding='utf-8',
             pretty_print=True).decode()

2. Use PyQuery to strip tag attributes, returning both the processed source and the pure text

#!/usr/bin/env python
# -*-coding:utf-8-*-

from loguru import logger
from pyquery import PyQuery as pq

def pyquery_clean(self, text, url, pq_dict) -> object:
  '''
  Apply the necessary PyQuery processing
  :param text: html_content
  :param url: page url, used to complete relative links
  :param pq_dict: PyQuery expressions for the elements to remove
  :return: plain text, html_content
  '''
  # Dict of PyQuery expressions to remove
  remove_by_pq = pq_dict if pq_dict else dict()
  # Tag-attribute whitelist
  attr_white_list = ['rowspan', 'colspan']
  # Attribute keys that may carry the image link
  img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
  # Build the PyQuery object
  dom = pq(text)

  # Remove the useless tags
  for bad_tag in remove_by_pq.values():
    for bad in dom(bad_tag):
      bad_string = pq(bad).html()
      logger.debug(f"clean article content : {bad_string}")
    dom.remove(bad_tag)

  # Process each tag's attributes
  for tag in dom('*'):
    for key, value in tag.attrib.items():
      # Skip logic: keep the rowspan and colspan attributes on tables
      if key in attr_white_list:
        continue
      # Image links: complete any partial url, then swap it in as src
      if key in img_key_list:
        img_url = self.absolute_url(url, value)
        pq(tag).remove_attr(key)
        pq(tag).attr('src', img_url)
        pq(tag).attr('alt', '')
      # Keep the img alt attribute, but blank it out
      elif key == 'alt':
        pq(tag).attr(key, '')
      # Every other attribute is deleted
      else:
        pq(tag).remove_attr(key)

  return dom.text(), dom.html()
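
A quick sketch of the attribute handling, again assuming the full CleanArticle class from the end of the article and a made-up page url:

html = (
  '<div style="color:red" onclick="track()">'
  '<img data-src="/img/1.png" class="lazy">'
  '</div>'
)
ca = CleanArticle(text=html, url='https://example.com/post/1')
text, html_content = ca.pyquery_clean(html, ca.url, {})
# style, onclick, and class are all stripped; data-src is rewritten via
# absolute_url to src="https://example.com/img/1.png", with an empty alt added.
print(html_content)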

3. Use regular expressions to clean up whitespace and newline content

#!/usr/bin/env python
# -*-coding:utf-8-*-

import re

def regular_clean(self, str1: str, str2: str):
  '''
  Clean up the data format with regular expressions
  :param str1: content
  :param str2: html_content
  :return: the processed results
  '''

  def new_line(text):
    text = re.sub(r'<br\s?/?>', '<br>', text)
    text = re.sub(
      r'</?a>|</?em>|</?html>|</?body>|'
      r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
      r'</?strong>|</?blockquote>|</?b>|'
      r'</?span>|</?i>|</?hr>|</?font>',
      '',
      text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'<h[1-6]>', '<p>', text)
    text = re.sub(r'</h[1-6]>', '</p>', text)
    text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
    return text

  str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues

  # TODO html_content processing: 1. remove redundant, unusable tags and tags that break the display 2. handle and normalize newlines

  str2 = new_line(text=str2)

  return str1, str2
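
A sketch of the regex pass in isolation, assuming the CleanArticle class shown below (the input strings are invented):

ca = CleanArticle(text='<p>placeholder</p>')
content, html_content = ca.regular_clean(
  'some   spaced   text',
  '<h2>Title</h2><span>body</span><br />end'
)
# clean_blank deletes runs of two or more whitespace characters outright, so
# content becomes 'somespacedtext'; new_line then turns the <h2> pair into
# <p>...</p> with a trailing newline, drops the <span> wrapper, and
# normalizes <br /> to <br/>:
print(html_content)  # -> '<p>Title</p>\nbody<br/>end'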

To finish, here is the class that wraps up each of the methods above:

#!/usr/bin/env python
# -*-coding:utf-8-*-
'''
author: szhan
date: 2020-08-17
summary: clean html_content and extract the data in a pure format
'''

import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin

from loguru import logger


class CleanArticle:

  def __init__(
      self,
      text: str,
      url: str = '',
      xpath_dict: dict = None,
      pq_dict: dict = None
  ):
    self.text = text
    self.url = url
    self.xpath_dict = xpath_dict or dict()
    self.pq_dict = pq_dict or dict()

  @staticmethod
  def absolute_url(baseurl: str, url: str) -> str:
    '''
    Complete a relative url
    :param baseurl: scheme url
    :param url: target url
    :return: complete url
    '''
    target_url = url if urlsplit(url).scheme else urljoin(baseurl, url)
    return target_url

  @staticmethod
  def clean_blank(text):
    '''
    Whitespace handling
    :param text:
    :return:
    '''
    text = text.replace('&#13;', '').replace('\u3000', '').replace('\t', '').replace('\xa0', '')
    text = re.sub(r'\s{2,}', '', text)
    text = re.sub(r'\n{2,}', '\n', text)
    text = text.strip('\n').strip()
    return text

  def run(self):
    '''
    :return: processed content, html_content
    '''
    if (not bool(self.text)) or (not isinstance(self.text, str)):
      raise ValueError('html_content has a bad type value')
    # Step 1: use xpath to strip whitespace, comments, and tags such as iframe, button, form, script, style, video
    text = self.xpath_clean(self.text, self.xpath_dict)

    # Step 2: use pyquery to handle the finer details
    str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)

    # Final regular-expression pass
    content, html_content = self.regular_clean(str1, str2)

    return content, html_content

  def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    Remove unwanted elements via XPath
    :param text: html_content
    :param xpath_dict: XPath expressions for the elements to remove
    :return: html_content as a string
    '''
    remove_by_xpath = xpath_dict if xpath_dict else dict()

    # Tags that should almost always be removed; barring unusual cases, all of these go
    remove_by_xpath.update({
      '_remove_2': '//iframe',
      '_remove_4': '//button',
      '_remove_5': '//form',
      '_remove_6': '//input',
      '_remove_7': '//select',
      '_remove_8': '//option',
      '_remove_9': '//textarea',
      '_remove_10': '//figure',
      '_remove_11': '//figcaption',
      '_remove_12': '//frame',
      '_remove_13': '//video',
      '_remove_14': '//script',
      '_remove_15': '//style'
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)

    # Standard removal pass: delete the unwanted tags
    for xpath in remove_by_xpath.values():
      for bad in selector.xpath(xpath):
        bad_string = etree.tostring(bad, encoding='utf-8',
                      pretty_print=True).decode()
        logger.debug(f"clean article content : {bad_string}")
        bad.getparent().remove(bad)

    skip_tip = "name()='img' or name()='tr' or " \
          "name()='th' or name()='tbody' or " \
          "name()='thead' or name()='table'"
    # Check every remaining element for content; delete any that are empty
    for p in selector.xpath(f"//*[not({skip_tip})]"):
      # Skip logic: keep nodes that contain table or image descendants, or any real text
      if p.xpath(f".//*[{skip_tip}]") or \
          bool(re.sub(r'\s', '', p.xpath('string(.)'))):
        continue

      bad_p = etree.tostring(p, encoding='utf-8',
                  pretty_print=True).decode()
      logger.debug(f"clean p tag : {bad_p}")
      p.getparent().remove(p)

    return etree.tostring(selector, encoding='utf-8',
               pretty_print=True).decode()

  def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    Apply the necessary PyQuery processing
    :param text: html_content
    :param url: page url, used to complete relative links
    :param pq_dict: PyQuery expressions for the elements to remove
    :return: plain text, html_content
    '''
    # Dict of PyQuery expressions to remove
    remove_by_pq = pq_dict if pq_dict else dict()
    # Tag-attribute whitelist
    attr_white_list = ['rowspan', 'colspan']
    # Attribute keys that may carry the image link
    img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
    # Build the PyQuery object
    dom = pq(text)

    # Remove the useless tags
    for bad_tag in remove_by_pq.values():
      for bad in dom(bad_tag):
        bad_string = pq(bad).html()
        logger.debug(f"clean article content : {bad_string}")
      dom.remove(bad_tag)

    # Process each tag's attributes
    for tag in dom('*'):
      for key, value in tag.attrib.items():
        # Skip logic: keep the rowspan and colspan attributes on tables
        if key in attr_white_list:
          continue
        # Image links: complete any partial url, then swap it in as src
        if key in img_key_list:
          img_url = self.absolute_url(url, value)
          pq(tag).remove_attr(key)
          pq(tag).attr('src', img_url)
          pq(tag).attr('alt', '')
        # Keep the img alt attribute, but blank it out
        elif key == 'alt':
          pq(tag).attr(key, '')
        # Every other attribute is deleted
        else:
          pq(tag).remove_attr(key)

    return dom.text(), dom.html()

  def regular_clean(self, str1: str, str2: str):
    '''
    Clean up the data format with regular expressions
    :param str1: content
    :param str2: html_content
    :return: the processed results
    '''

    def new_line(text):
      text = re.sub(r'<br\s?/?>', '<br>', text)
      text = re.sub(
        r'</?a>|</?em>|</?html>|</?body>|'
        r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
        r'</?strong>|</?blockquote>|</?b>|'
        r'</?span>|</?i>|</?hr>|</?font>',
        '',
        text)
      text = re.sub(r'\n', '', text)
      text = re.sub(r'<h[1-6]>', '<p>', text)
      text = re.sub(r'</h[1-6]>', '</p>', text)
      text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
      return text

    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues

    # TODO html_content processing: 1. remove redundant, unusable tags and tags that break the display 2. handle and normalize newlines

    str2 = new_line(text=str2)

    return str1, str2

if __name__ == '__main__':
  with open('html_content.html', 'r', encoding='utf-8') as f:
    html = f.read()
  ca = CleanArticle(text=html)
  _, html_content = ca.run()
  print(html_content)
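
Both removal dicts are hooks for site-specific rules applied on top of the built-in ones. A hypothetical call (the selectors below are invented for illustration):

ca = CleanArticle(
  text=html,
  url='https://example.com/article/1',
  xpath_dict={'remove_ads': '//div[@class="ad-banner"]'},
  pq_dict={'remove_share': 'div.share-buttons'},
)
content, html_content = ca.run()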

Summary

That concludes this detailed look at a format-cleaning tool built on XPath selectors, PyQuery, and regular expressions; for more on PyQuery and regular-expression cleaning tools, see 编程宝库's earlier articles.
