A Python script for scraping u148.net

I like reading interesting digest pieces, but I kept having to jump back and forth between pages, so I wondered whether there was a way to pull all the content I follow into a single page. There is, if you build it yourself, which is how this Python scraping script came about. Note that it targets Python 2 and BeautifulSoup 3 (urllib2 and sys.setdefaultencoding do not exist in Python 3).

#coding=utf-8
# Python 2 script; uses urllib2 and BeautifulSoup 3 (both Python 2 only).
import urllib2
import time
import datetime
from BeautifulSoup import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getContentFunc(webUrl):
    """Fetch one article page and return its title plus body as an HTML string."""
    starttime = datetime.datetime.now()
    page = urllib2.urlopen(webUrl)
    soup = BeautifulSoup(page)
    # Echo the page title as simple progress output.
    print(soup.html.head.title.string)
    # Narrow the search to the article container.
    contentTemp = BeautifulSoup(str(soup.find('div', attrs={'class': 'u148content'})))
    # Grab the article title.
    contentTitle = contentTemp.find('h1')
    # Grab the article body.
    contentPaper = contentTemp.find('div', attrs={'class': 'content'})
    endtime = datetime.datetime.now()
    # Report how many seconds the fetch and parse took.
    print((endtime - starttime).seconds)
    return str(contentTitle) + str(contentPaper)

def getPageFunc(webUrl):
    """Fetch one list page and return the concatenated HTML of its matching articles."""
    page = urllib2.urlopen(webUrl)
    soup = BeautifulSoup(page)
    contentTemp = BeautifulSoup(str(soup.find('div', attrs={'class': 'u148content'})))
    allTitleList = contentTemp.findAll('div', attrs={'class': 'mainlist'})
    totalPageContent = str('')
    for tempTitle in allTitleList:
        # Each entry's h1 holds two links: the category tag and the article itself.
        contentPageTitle = tempTitle.find('h1')
        contentPageTitle1 = contentPageTitle.findAll('a')
        pageValueCmp = contentPageTitle1[0].string
        # Keep only the "[图画]" (pictures) and "[文字]" (text) categories.
        if pageValueCmp != u"[图画]" and pageValueCmp != u"[文字]":
            continue
        print(pageValueCmp)
        # The second <a> carries the article link; read its href attribute
        # directly instead of slicing the tag's string representation.
        contentPageUrl = "http://www.u148.net" + contentPageTitle1[1]['href']
        totalPageContent += getContentFunc(contentPageUrl) + "<h2>++++++++++++++++++++++++</h2><br/>"
    return totalPageContent



if __name__ == "__main__":
    getPageNum = int(1)
    if len(sys.argv) < 2:
        print(u'Downloading the front page only by default; pass an argument n to fetch n pages')
    else:
        getPageNum = int(sys.argv[1])
    # Page 1 lives at /list.html; pages 2..n live at /list/<i>.html.
    allPage = getPageFunc("http://www.u148.net/list.html")
    if getPageNum > 1:
        for i in range(2, getPageNum + 1):
            tempUrl = "http://www.u148.net/list/" + str(i) + ".html"
            print(tempUrl)
            allPage += getPageFunc(tempUrl)

    # Wrap the collected articles in a minimal XHTML skeleton.
    webHead = (u'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
               u'"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
               u'<html xmlns="http://www.w3.org/1999/xhtml">\n'
               u'<head>\n<title>\nU148 Article Collection\n</title>\n</head>\n<body>\n')
    webTail = u"</body>\n</html>\n"
    aHTML = webHead + allPage + webTail
    # Name the output file after today's date: <YYYY-MM-DD>-u148.net.html.
    currentDate = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    wfile = open(currentDate + "-u148.net.html", 'wt')
    wfile.writelines(BeautifulSoup(aHTML).prettify())
    wfile.close()
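
To run it, save the script and optionally pass the number of list pages to fetch; the file name u148.py below is my choice, not something from the script itself:

    python u148.py 3

This fetches list pages 1 through 3 and writes the combined articles to a dated file of the form <YYYY-MM-DD>-u148.net.html in the current directory.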

