from xml.dom import minidom

doc = minidom.parse('book.xml')
root = doc.documentElement
print(root.nodeName)
print(root.getAttribute('name'))

books = root.getElementsByTagName("book")
for book in books:
    amounts = book.getElementsByTagName("amount")
    prices = book.getElementsByTagName("price")
    print("The book " + book.getAttribute("name") + " has "
          + amounts[0].childNodes[0].nodeValue + " left.")
    print("And the price of it is " + prices[0].childNodes[0].nodeValue)
bookstore
The First Bookstore
The book HarryPotter has 23 left.
And the price of it is 100
The book Little Prince has 10 left.
And the price of it is 88
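For reference, the book.xml file itself is not shown in this excerpt, but a file consistent with the output above would be:

<?xml version="1.0" encoding="utf-8"?>
<bookstore name="The First Bookstore">
    <book name="HarryPotter">
        <amount>23</amount>
        <price>100</price>
    </book>
    <book name="Little Prince">
        <amount>10</amount>
        <price>88</price>
    </book>
</bookstore>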
element bookstore, attrs: {'name': 'The First Bookstore'}
element book, attrs: {'name': 'HarryPotter'}
element amount, attrs: {}
And amount's value is 23
element amount end.
element price, attrs: {}
And price's value is 100
element price end.
element book end.
element book, attrs: {'name': 'Little Prince'}
element amount, attrs: {}
And amount's value is 10
element amount end.
element price, attrs: {}
And price's value is 88
element price end.
element book end.
element bookstore end.
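This event trace comes from a SAX-style parse rather than the DOM code above. The handler itself is not in the excerpt; a minimal xml.sax handler that would produce this exact format (the class and attribute names here are mine) is:

import xml.sax

class BookHandler(xml.sax.ContentHandler):
    current = ""

    def startElement(self, name, attrs):
        print("element %s, attrs: %s" % (name, dict(attrs)))
        self.current = name

    def characters(self, content):
        # only report non-whitespace text, attributed to the enclosing element
        if content.strip():
            print("And %s's value is %s" % (self.current, content.strip()))

    def endElement(self, name):
        print("element %s end." % name)

xml.sax.parse("book.xml", BookHandler())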
#!/usr/bin/env python3
from html.parser import HTMLParser

class DefaultParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'html':
            print('Start parsing.')
        if tag == 'title':
            print("Output title:")
        if tag == 'h1':
            print("Output h1:")
        if attrs:
            print("%s has attr: %s" % (tag, attrs))

    def handle_endtag(self, tag):
        if tag == 'html':
            print("End parsing")

    def handle_startendtag(self, tag, attrs):
        if tag == 'hr':
            print('Found hr')

    def handle_data(self, data):
        if data:
            print(data)

parser = DefaultParser()
with open("index.html", encoding='utf-8') as f:
    parser.feed(f.read())
parser.close()
Running this produces the following output:
Start parsing.
Output title:
403 Forbidden
body has attr: [('bgcolor', 'white')]
Output h1:
403 Forbidden
Found hr
nginx/1.4.6 (Ubuntu)
End parsing
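Judging by the parsed data, index.html here is just a stock nginx 403 page, roughly like the reconstruction below (note the XHTML-style <hr/> is what makes handle_startendtag fire):

<html>
<head><title>403 Forbidden</title></head>
<body bgcolor="white">
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx/1.4.6 (Ubuntu)</center>
</body>
</html>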
import requests
from PIL import Image
from io import BytesIO

url = "https://www.baidu.com/img/bd_logo1.png"
res = requests.get(url, stream=True)
with open("baidu_logo_stream.png", 'wb+') as f:
    for chunk in res.iter_content(512):
        f.write(chunk)
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Sample File</title>
</head>
<body>
<!--body_start-->
<p class="para">This is a sample file for beautiful soup4</p>
<a href="https://www.baidu.com">Click Here</a><p>And you will see the homepage of baidu <em id="em">BAIDU</em> </p>
<p>last para</p>
<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Sample File
  </title>
 </head>
 <body>
  <!--body_start-->
  <p class="para">
   This is a sample file for beautiful soup4
  </p>
  <a href="https://www.baidu.com">
   Click Here
  </a>
  <p>
   And you will see the homepage of baidu
   <em id="em">
    BAIDU
   </em>
  </p>
  <p>
   last para
  </p>
 </body>
</html>
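The indented version above is the output of BeautifulSoup's prettify(); the call producing it is presumably along these lines (the input filename is an assumption, and html5lib is the parser this post uses elsewhere, which also explains the normalized meta charset and closing tags):

from bs4 import BeautifulSoup

with open("sample.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html5lib")
print(soup.prettify())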
import sqlite3

conn = sqlite3.connect("test.db")
create_sql = ("create table if not exists "
              "test(id int primary key not null, name varchar(20) not null)")
conn.execute(create_sql)

insert_sql = "insert into test values(?, ?)"
conn.execute(insert_sql, (1, "test"))
conn.execute(insert_sql, (2, "test"))
conn.commit()  # without a commit, the inserts are discarded when the connection closes

get_sql = "select * from test where name = ?"
res = conn.execute(get_sql, ("test",))
for item in res.fetchall():
    print(item[1])
conn.close()
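As an aside, a sqlite3 connection can also be used as a context manager, which commits the transaction on success and rolls it back on an exception; the insert block could equivalently be written as:

with conn:  # implicit commit on success, rollback on exception
    conn.execute(insert_sql, (1, "test"))
    conn.execute(insert_sql, (2, "test"))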
res = s.post(login_url, data=formdata, headers=headers)
home = s.get("http://tjg.hangowa.com/shop/index.php?act=cart")
soup = BeautifulSoup(home.content, "html5lib")
goods = soup.select("td > a > img")
for good in goods:
    print(good.get("alt"))
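The s here is a requests.Session, which carries the cookies set by the login POST over to the cart GET. The setup preceding this fragment is not shown; it would be roughly the following (login_url and the form field names are guesses):

import requests
from bs4 import BeautifulSoup

s = requests.Session()
login_url = "http://tjg.hangowa.com/shop/index.php?act=login"  # assumed
formdata = {"username": "...", "password": "..."}              # field names assumed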
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Referer": "https://www.bilibili.com/ranking"
}
header = {
    'Referer': "https://www.bilibili.com/account/dynamic",
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36"
}
res = requests.get(url, cookies=cookie, headers=header)
result = res.text
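The cookie argument is a plain dict. If you copy the raw Cookie header out of the browser's developer tools, one way to convert it (the value below is a placeholder) is:

raw = "DedeUserID=123; SESSDATA=abc"  # placeholder, copy yours from the browser
cookie = {k: v for k, v in (pair.split("=", 1) for pair in raw.split("; "))}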
import scrapy

class TestSpider(scrapy.Spider):
    name = "Test"
    start_urls = ["https://www.julyedu.com/category/index"]

    def parse(self, response):
        for item in response.xpath('//div[@class="course_info_box"]'):
            title = item.xpath('a/h4/text()').extract_first()
            desc = item.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first()
            time = item.xpath('a/p[@class="course-info-tip info-time"]/text()').extract_first()
            res = {
                "title": title,
                "desc": desc,
                "time": time
            }
            yield res
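Because this spider is self-contained, it can be run without creating a full project, for example (the file and output names here are mine):

scrapy runspider test_spider.py -o courses.json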
class BlogSpider(scrapy.Spider):
    name = "cnblog"
    start_urls = ["https://www.cnblogs.com/pick/#p%s" % p for p in range(2, 11)]

    def parse(self, response):
        for item in response.xpath('//div[@class="post_item"]'):
            title = item.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first()
            recommand = item.xpath('div[@class="digg"]/div/span/text()').extract_first()
            yield {"title": title, "recommand": recommand}
➜ scrapy startproject qqnews
New Scrapy project 'qqnews', using template directory '/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /.../qqnews

You can start your first spider with:
    cd qqnews
    scrapy genspider example example.com

➜ tree qqnews
qqnews
├── qqnews
│   ├── __init__.py
│   ├── __pycache__
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── __pycache__
└── scrapy.cfg
for link in response.xpath('//div[@class="paginator"]/a/@href').extract():
    print(link)
    yield scrapy.Request(link, callback=self.page_parse, headers=header)

def page_parse(self, response):
    for item in response.xpath('//tr[@class="item"]'):
        book = DoubanBookItem()
        book['name'] = item.xpath('td[2]/div[1]/a/@title').extract_first()
        book['rating'] = item.xpath('td[2]/div[2]/span[2]/text()').extract_first()
        info = item.xpath('td[2]/p[1]/text()').extract_first().split(" / ")
        book['author'] = info[0]
        book['price'] = info[-1]
        book['edition_year'] = info[-2]
        book['publisher'] = info[-3]
        yield book
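DoubanBookItem lives in the project's items.py, which is not shown here; given the fields assigned above, its definition has to be essentially:

import scrapy

class DoubanBookItem(scrapy.Item):
    name = scrapy.Field()
    rating = scrapy.Field()
    author = scrapy.Field()
    price = scrapy.Field()
    edition_year = scrapy.Field()
    publisher = scrapy.Field()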
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request

class PhotoCrawlerPipeline(ImagesPipeline):
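The body of the pipeline is cut off in this excerpt. The two methods an ImagesPipeline subclass typically overrides, wired to the Request and DropItem imports above, look roughly like this (image_urls and image_paths are Scrapy's default field names, assumed for this project):

class PhotoCrawlerPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # schedule one download per image URL found on the item
        for url in item['image_urls']:
            yield Request(url)

    def item_completed(self, results, item, info):
        # keep only items that actually produced downloaded files
        paths = [x['path'] for ok, x in results if ok]
        if not paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = paths
        return item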
headers = {
    "cookie": "bid=MsOyVJ-WczQ",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
}
def run_thread(n):
    for i in range(100000):
        lock.acquire()
        try:
            change_it(n)
        finally:
            lock.release()
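run_thread depends on a shared balance, a change_it helper, and a module-level lock, none of which appear in the excerpt. This is the classic lost-update demo, so the surrounding harness is presumably close to the sketch below (the deposit amounts are assumptions, and the first set of timings comes from a variant of run_thread that calls change_it without taking the lock):

import threading

balance = 0
lock = threading.Lock()

def change_it(n):
    # the net effect is zero, but the two statements can interleave between threads
    global balance
    balance = balance + n
    balance = balance - n

t1 = threading.Thread(target=run_thread, args=(5,))
t2 = threading.Thread(target=run_thread, args=(8,))
t1.start(); t2.start()
t1.join(); t2.join()
print(balance)  # non-zero when racing, 0 once the lock serializes change_it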
Speed test:
➜ time python test.py
52
python test.py  0.06s user 0.04s system 117% cpu 0.081 total
➜ time python test.py
50
python test.py  0.06s user 0.04s system 115% cpu 0.081 total
➜ time python test.py
10
python test.py  0.06s user 0.04s system 117% cpu 0.085 total
After adding the lock:
➜ time python test.py
0
python test.py  0.16s user 0.16s system 133% cpu 0.237 total
➜ time python test.py
0
python test.py  0.16s user 0.15s system 130% cpu 0.234 total
➜ time python test.py
0
python test.py  0.16s user 0.16s system 131% cpu 0.245 total
@debug
def pull():
    while True:
        q.get()
        print("Get one element. Current size: %d" % q.qsize())

threads = [
    threading.Thread(target=push, name="Push_thread"),
    threading.Thread(target=pull, name="Pull_thread")
]
for thread in threads:
    thread.start()
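push and the debug decorator are not part of the excerpt; judging from the sample output below, they would be something like this (the queue bound and the pushed value are assumptions):

import queue
import threading
import functools

q = queue.Queue(3)  # a small bound, guessed from the sizes in the output

def debug(func):
    # announce which thread starts running the wrapped function
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print("Current thread: %s started." % threading.current_thread().name)
        return func(*args, **kwargs)
    return wrapper

@debug
def push():
    while True:
        q.put(1)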
Sample output:
➜ python3 queueTest.py
Current thread: Push_thread started.
Current thread: Pull_thread started.
Get one element. Current size: 0
Get one element. Current size: 0
Get one element. Current size: 3
Get one element. Current size: 2
Get one element. Current size: 1
Get one element. Current size: 0
import threading
import requests
import json
from os import path
from datetime import datetime

def download(url, name=None, folder="/tmp/"):
    data = requests.get(url).content
    if not name:
        name = url.split("/")[-1]
    # build the target path; the parameter is renamed so it no longer
    # shadows the imported os.path module
    filename = path.join(folder, name)
    print("downloading %s" % filename)
    with open(filename, 'wb') as pic:
        pic.write(data)

def parse(content):
    data = json.loads(content)  # json.loads no longer takes an encoding argument
    imgs = []
    for item in data["list"]:
        sub_item = item["arr"]
        if "image" in sub_item["type"]:
            imgs.extend("http://litten.me/ins/%s.jpg" % img for img in sub_item["link"])
    for thread in [threading.Thread(target=download, name="download_thread", args=(img,))
                   for img in imgs]:
        thread.start()
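A driver for parse is not shown; a hypothetical entry point (the JSON URL is a guess modeled on the image URLs above) would be:

if __name__ == "__main__":
    res = requests.get("http://litten.me/ins/photos.json")  # URL assumed
    parse(res.text)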