Python专题, 语言

用Python爬取知乎中的文章(附爬取PRL/PRB/Physics Magazine/Nature Physics代码)

说明:本篇仅仅是一些爬虫的练习例子,实际应用不大。

知乎中有很多不错的科研笔记。虽然知乎有关注的功能,但最新的文章经常被淹没在各种点赞和回答中。本篇通过Python爬取自己关注的知乎主页,将文章按时间排列,并把最新文章的链接保存至HTML文件中。由于没有写模拟登录知乎的代码,因此每个主页只能爬取到最新的两篇文章。同时,代码每次运行都会覆盖之前的数据,所以一个博主的文章始终只显示最新的两篇。此外,如果主页设置了隐私保护,可能无法爬取。阅读更多文章可直接访问各个博主的知乎主页。爬虫程序可设置为计划任务,每日运行一次以自动更新。

爬取知乎的Python代码:

"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/17937
"""

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re  
import datetime

import sys
from html import escape

# Canonical prefix of a Zhihu column article; the numeric article id follows it.
ARTICLE_URL_PREFIX = 'https://zhuanlan.zhihu.com/p/'

# Head of the generated page: charset declaration plus link styling.
PAGE_HEAD = '<meta charset="utf-8"><style type="text/css">a{text-decoration: none;color: #0a5794;}a:hover {text-decoration: underline;color: red; }</style>'

# Profile pages to scan. Because no Zhihu login is simulated, only the two
# newest posts of each profile can be crawled.
authors = ["https://www.zhihu.com/people/guanjihuan/posts", # Guan
]


def date_stamp(now=None):
    """Return a date formatted as 'YYYY.MM.DD' (month and day zero-padded).

    Args:
        now: a ``datetime.datetime`` to format; defaults to the current time.
            The clock is read once so the three fields always agree.
    """
    if now is None:
        now = datetime.datetime.now()
    return '{}.{:02d}.{:02d}'.format(now.year, now.month, now.day)


def sorted_article_links(hrefs):
    """Return canonical article links found in *hrefs*, largest id first.

    A link counts as an article when it contains '//zhuanlan.zhihu.com/p/'
    followed by digits and does not contain 'edit'. Scheme-less links and
    links with trailing queries or paths are normalised to
    ``ARTICLE_URL_PREFIX + id``; duplicates are dropped.

    Args:
        hrefs: iterable of raw ``href`` attribute strings.

    Returns:
        A list of article URLs sorted by numeric id in descending order.
    """
    article_ids = set()
    for href in hrefs:
        if 'edit' in href:  # skip edit links of one's own posts
            continue
        found = re.search(r'//zhuanlan\.zhihu\.com/p/(\d+)', href)
        if found:
            article_ids.add(int(found.group(1)))
    return [ARTICLE_URL_PREFIX + str(n) for n in sorted(article_ids, reverse=True)]


def format_item(href, title, author, post_time):
    """Return one ``<li>`` entry for an article, with all values HTML-escaped."""
    return ('<li><a target="_blank" href="' + escape(href) + '">'
            + escape(title) + '</a>&nbsp;&nbsp;'
            + escape(author) + '&nbsp;&nbsp;'
            + escape(post_time) + '</li>')


def main():
    """Crawl every profile in ``authors`` and write the links to zhihu.html."""
    stamp = date_stamp()

    # Collect every hyperlink from the profile pages.
    hrefs = []
    for start_link in authors:
        page = urlopen(start_link).read().decode('utf-8')
        soup = BeautifulSoup(page, features='lxml')
        hrefs.extend(a_tag['href'] for a_tag in soup.find_all('a', href=True))

    # Fetch each article and write one list entry per article.
    with open('zhihu.html', 'w', encoding='UTF-8') as f:
        f.write(PAGE_HEAD)
        f.write('<p>' + stamp + ' 已更新</p>')
        for href in sorted_article_links(hrefs):
            try:
                page = urlopen(href).read().decode('utf-8')
                soup = BeautifulSoup(page, features='lxml')
                # NOTE(review): the slices presumably trim a fixed site suffix
                # from the title and fixed labels around the date - confirm
                # against the current page layout.
                title = soup.title.get_text()[:-5]
                author = soup.find("span", {"class": "UserLink AuthorInfo-name"}).get_text()
                post_time = soup.find("div", {"class": "ContentItem-time"}).get_text()[4:-6]
            except Exception as err:
                # Best effort: a failure on one article (network error, missing
                # tag, decode error) must not abort the run. Report and skip,
                # and never write a half-built entry.
                print('skipped ' + href + ': ' + repr(err), file=sys.stderr)
                continue
            f.write(format_item(href, title, author, post_time))


if __name__ == '__main__':
    main()

附其他爬虫代码(生成HTML网页):

(1)爬取PRL

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re  
import datetime


from html import escape
from urllib.parse import urljoin

# Site root used to complete relative article links.
SITE = 'https://journals.aps.org'

# Head of the generated page: charset declaration plus link styling.
PAGE_HEAD = '<meta charset="utf-8"><style type="text/css">a{text-decoration: none;color: #0a5794;}a:hover {text-decoration: underline;color: red; }</style>'

# Listing pages to scan, in order. Append
# "https://journals.aps.org/prl/recent?page=2" to include the second page.
start_links = ["https://journals.aps.org/prl/recent"]


def date_stamp(now=None):
    """Return a date formatted as 'YYYY.MM.DD' (month and day zero-padded).

    Args:
        now: a ``datetime.datetime`` to format; defaults to the current time.
    """
    if now is None:
        now = datetime.datetime.now()
    return '{}.{:02d}.{:02d}'.format(now.year, now.month, now.day)


def article_link(href):
    """Return the absolute article URL for *href*, or None if it is not one.

    A link is accepted when it contains '/abstract/' and carries no query
    string ('?'). Relative links are completed against ``SITE``.
    """
    if '/abstract/' not in href or '?' in href:
        return None
    return urljoin(SITE, href)


def published_date(info):
    """Return the text following '– Published ' in *info*, or '' if absent."""
    found = re.search('– Published(.*)', info, re.S)
    # [1:] drops the single separator character right after "Published".
    return found.group(1)[1:] if found else ''


def format_link(href, text):
    """Return the opening ``<li>`` with an escaped link; the caller closes it."""
    return ('<li><a target="_blank" href="' + escape(href) + '">'
            + escape(text) + '</a>&nbsp;&nbsp;')


def main():
    """Scrape the listing pages and write the article links to prl.html."""
    stamp = date_stamp()
    seen = set()  # links already written, so each article appears once
    with open('prl.html', 'w', encoding='UTF-8') as f:
        f.write(PAGE_HEAD)
        f.write('<p>' + stamp + ' 已更新</p>')
        for start_link in start_links:
            page = urlopen(start_link).read().decode('utf-8')
            soup = BeautifulSoup(page, features='lxml')
            for article in soup.find_all('div', {"class": "article panel article-result"}):
                entries = []
                for a_tag in article.find_all('a', href=True):
                    href = article_link(a_tag['href'])
                    if href is None or href in seen:
                        continue
                    seen.add(href)
                    entries.append(format_link(href, a_tag.get_text()))
                if not entries:
                    # Nothing new for this article: do not emit a stray </li>.
                    continue
                pub_info = article.find('h6', {"class": "pub-info"})
                date = published_date(pub_info.get_text()) if pub_info is not None else ''
                f.write(''.join(entries) + escape(date) + '</li>')


if __name__ == '__main__':
    main()

(2)爬取PRB

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re  
import datetime


from html import escape
from urllib.parse import urljoin

# Site root used to complete relative article links.
SITE = 'https://journals.aps.org'

# Head of the generated page: charset declaration plus link styling.
PAGE_HEAD = '<meta charset="utf-8"><style type="text/css">a{text-decoration: none;color: #0a5794;}a:hover {text-decoration: underline;color: red; }</style>'

# Listing pages to scan, in order. Append
# "https://journals.aps.org/prb/recent?page=2" to include the second page.
start_links = ["https://journals.aps.org/prb/recent"]


def date_stamp(now=None):
    """Return a date formatted as 'YYYY.MM.DD' (month and day zero-padded).

    Args:
        now: a ``datetime.datetime`` to format; defaults to the current time.
    """
    if now is None:
        now = datetime.datetime.now()
    return '{}.{:02d}.{:02d}'.format(now.year, now.month, now.day)


def article_link(href):
    """Return the absolute article URL for *href*, or None if it is not one.

    A link is accepted when it contains '/abstract/' and carries no query
    string ('?'). Relative links are completed against ``SITE``.
    """
    if '/abstract/' not in href or '?' in href:
        return None
    return urljoin(SITE, href)


def published_date(info):
    """Return the text following '– Published ' in *info*, or '' if absent."""
    found = re.search('– Published(.*)', info, re.S)
    # [1:] drops the single separator character right after "Published".
    return found.group(1)[1:] if found else ''


def format_link(href, text):
    """Return the opening ``<li>`` with an escaped link; the caller closes it."""
    return ('<li><a target="_blank" href="' + escape(href) + '">'
            + escape(text) + '</a>&nbsp;&nbsp;')


def main():
    """Scrape the listing pages and write the article links to prb.html."""
    stamp = date_stamp()
    seen = set()  # links already written, so each article appears once
    with open('prb.html', 'w', encoding='UTF-8') as f:
        f.write(PAGE_HEAD)
        f.write('<p>' + stamp + ' 已更新</p>')
        for start_link in start_links:
            page = urlopen(start_link).read().decode('utf-8')
            soup = BeautifulSoup(page, features='lxml')
            for article in soup.find_all('div', {"class": "article panel article-result"}):
                entries = []
                for a_tag in article.find_all('a', href=True):
                    href = article_link(a_tag['href'])
                    if href is None or href in seen:
                        continue
                    seen.add(href)
                    entries.append(format_link(href, a_tag.get_text()))
                if not entries:
                    # Nothing new for this article: do not emit a stray </li>.
                    continue
                pub_info = article.find('h6', {"class": "pub-info"})
                date = published_date(pub_info.get_text()) if pub_info is not None else ''
                f.write(''.join(entries) + escape(date) + '</li>')


if __name__ == '__main__':
    main()

(3)爬取Physics Magazine

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re  
import datetime


from html import escape
from urllib.parse import urljoin

# Page to scrape; also the root used to complete relative article links.
start_link = "https://physics.aps.org/"

# Head of the generated page: charset declaration plus link styling.
PAGE_HEAD = '<meta charset="utf-8"><style type="text/css">a{text-decoration: none;color: #0a5794;}a:hover {text-decoration: underline;color: red; }</style>'


def date_stamp(now=None):
    """Return a date formatted as 'YYYY.MM.DD' (month and day zero-padded).

    Args:
        now: a ``datetime.datetime`` to format; defaults to the current time.
    """
    if now is None:
        now = datetime.datetime.now()
    return '{}.{:02d}.{:02d}'.format(now.year, now.month, now.day)


def article_link(href):
    """Return the absolute article URL for *href*, or None if it is not one.

    A link is accepted when it contains '/articles/'. Relative links are
    completed against ``start_link``.
    """
    if '/articles/' not in href:
        return None
    return urljoin(start_link, href)


def format_link(href, text):
    """Return the opening ``<li>`` with an escaped link; the caller closes it."""
    return ('<li><a target="_blank" href="' + escape(href) + '">'
            + escape(text) + '</a>&nbsp;&nbsp;')


def main():
    """Scrape the front page and write the article links to physics_magazine.html."""
    stamp = date_stamp()
    seen = set()  # links already written, so each article appears once
    with open('physics_magazine.html', 'w', encoding='UTF-8') as f:
        f.write(PAGE_HEAD)
        f.write('<p>' + stamp + ' 已更新</p>')
        page = urlopen(start_link).read().decode('utf-8')
        soup = BeautifulSoup(page, features='lxml')
        for article in soup.find_all('div', {"class": "feed-item-details"}):
            entries = []
            for a_tag in article.find_all('a', href=True):
                href = article_link(a_tag['href'])
                if href is None or href in seen:
                    continue
                seen.add(href)
                entries.append(format_link(href, a_tag.get_text()))
            if not entries:
                # Nothing new for this item: do not emit a stray </li>.
                continue
            time_tag = article.find('time', {"class": "feed-item-date"})
            date = time_tag.get_text() if time_tag is not None else ''
            f.write(''.join(entries) + escape(date) + '</li>')


if __name__ == '__main__':
    main()

(4)爬取Nature Physics

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re  
import datetime


from html import escape
from urllib.parse import urljoin

# Site root used to complete relative article links.
SITE = 'https://www.nature.com'

# Listing page to scrape.
start_link = "https://www.nature.com/nphys/research-articles"

# Head of the generated page: charset declaration plus link styling.
PAGE_HEAD = '<meta charset="utf-8"><style type="text/css">a{text-decoration: none;color: #0a5794;}a:hover {text-decoration: underline;color: red; }</style>'


def date_stamp(now=None):
    """Return a date formatted as 'YYYY.MM.DD' (month and day zero-padded).

    Args:
        now: a ``datetime.datetime`` to format; defaults to the current time.
    """
    if now is None:
        now = datetime.datetime.now()
    return '{}.{:02d}.{:02d}'.format(now.year, now.month, now.day)


def article_link(href):
    """Return the absolute article URL for *href*, or None if it is not one.

    A link is accepted when it contains '/articles/' and carries no query
    string ('?'). Relative links are completed against ``SITE``.
    """
    if '/articles/' not in href or '?' in href:
        return None
    return urljoin(SITE, href)


def format_link(href, text):
    """Return the opening ``<li>`` with an escaped link; the caller closes it."""
    return ('<li><a target="_blank" href="' + escape(href) + '">'
            + escape(text) + '</a>&nbsp;&nbsp;')


def main():
    """Scrape the listing page and write the article links to nature_physics.html."""
    stamp = date_stamp()
    seen = set()  # links already written, so each article appears once
    with open('nature_physics.html', 'w', encoding='UTF-8') as f:
        f.write(PAGE_HEAD)
        f.write('<p>' + stamp + ' 已更新</p>')
        page = urlopen(start_link).read().decode('utf-8')
        soup = BeautifulSoup(page, features='lxml')
        for article in soup.find_all('article', {"class": "u-full-height c-card c-card--flush"}):
            entries = []
            for a_tag in article.find_all('a', href=True):
                href = article_link(a_tag['href'])
                if href is None or href in seen:
                    continue
                seen.add(href)
                entries.append(format_link(href, a_tag.get_text()))
            if not entries:
                # Nothing new for this card: do not emit a stray </li>.
                continue
            time_tag = article.find('time', {"class": "c-meta__item c-meta__item--block-at-lg"})
            date = time_tag.get_text() if time_tag is not None else ''
            f.write(''.join(entries) + escape(date) + '</li>')


if __name__ == '__main__':
    main()
579 次浏览

【说明:本站主要是个人的一些笔记和代码分享,内容可能会不定期修改。为了使全网显示的始终是最新版本,这里的文章未经同意请勿转载。引用请注明出处:https://www.guanjihuan.com】

评论说明:
(1)在保留浏览器缓存的前提下,目前支持72小时自主修改或删除个人评论。如果自己无法修改或删除评论,可再次评论或联系我。如有发现广告留言,请勿点击链接,博主会不定期删除。
(2)评论支持Latex公式。把latexpage作为标签放在任何位置,评论中的公式可正常编译,示例:
$Latex formula$  [latexpage]

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注