web scraping with python 代码笔记/code notes part2

整理《web scraping with python》书中代码笔记

Cleaning Your Dirty Data

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
def cleanInput(input):
input = re.sub('\n+', " ", input)
input = re.sub('\[[0-9]*\]', "", input)
input = re.sub(' +', " ", input)
input = bytes(input, "UTF-8")
input = input.decode("ascii", "ignore")
cleanInput = []
input = input.split(' ')
for item in input:
item = item.strip(string.punctuation)
if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
cleanInput.append(item)
return cleanInput
def ngrams(input, n):
input = cleanInput(input)
output = []
for i in range(len(input)-n+1):
output.append(input[i:i+n])
return output

Reading and Writing Natural Languages

Summarizing Data

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator
def cleanInput(input):
input = re.sub('\n+', " ", input).lower()
input = re.sub('\[[0-9]*\]', "", input)
input = re.sub(' +', " ", input)
input = bytes(input, "UTF-8")
input = input.decode("ascii", "ignore")
cleanInput = []
input = input.split(' ')
for item in input:
item = item.strip(string.punctuation)
if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
cleanInput.append(item)
return cleanInput
def ngrams(input, n):
input = cleanInput(input)
output = {}
for i in range(len(input)-n+1):
ngramTemp = " ".join(input[i:i+n])
if ngramTemp not in output:
output[ngramTemp] = 0
output[ngramTemp] += 1
return output
content = str(
urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(),
'utf-8')
ngrams = ngrams(content, 2)
sortedNGrams = sorted(ngrams.items(), key = operator.itemgetter(1), reverse=True)
print(sortedNGrams)

Markov Models

from urllib.request import urlopen
from random import randint
def wordListSum(wordList):
sum = 0
for word, value in wordList.items():
sum += value
return sum
def retrieveRandomWord(wordList):
randIndex = randint(1, wordListSum(wordList))
for word, value in wordList.items():
randIndex -= value
if randIndex <= 0:
return word
def buildWordDict(text):
#Remove newlines and quotes
text = text.replace("\n", " ");
text = text.replace("\"", "");
#Make sure punctuation marks are treated as their own "words,"
#so that they will be included in the Markov chain
punctuation = [',','.',';',':']
for symbol in punctuation:
text = text.replace(symbol, " "+symbol+" ");
words = text.split(" ")
#Filter out empty words
words = [word for word in words if word != ""]
wordDict = {}
for i in range(1, len(words)):
if words[i-1] not in wordDict:
#Create a new dictionary for this word
wordDict[words[i-1]] = {}
if words[i] not in wordDict[words[i-1]]:
wordDict[words[i-1]][words[i]] = 0
wordDict[words[i-1]][words[i]] = wordDict[words[i-1]][words[
i]] + 1
return wordDict
text = str(urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt")
.read(), 'utf-8')
wordDict = buildWordDict(text)
#Generate a Markov chain of length 100
length = 100
chain = ""
currentWord = "I"
for i in range(0, length):
chain += currentWord+" "
currentWord = retrieveRandomWord(wordDict[currentWord])
print(chain)

natural language toolkit

from nltk.book import *
from nltk import ngrams
fourgrams = ngrams(text6, 4)
for fourgram in fourgrams:
if fourgram[0] == "coconut":
print(fourgram)

Crawling Through Forms and Logins

import requests
params = {'firstname': 'Ryan', 'lastname': 'Mitchell'}
r = requests.post("http://pythonscraping.com/files/processing.php", data=params)
print(r.text)
# second example
import requests
files = {'uploadFile': open('../files/Python-logo.png', 'rb')}
r = requests.post("http://pythonscraping.com/pages/processing2.php",
files=files)
print(r.text)
# third example
import requests
session = requests.Session()
params = {'username': 'username', 'password': 'password'}
s = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params)
print("Cookie is set to:")
print(s.cookies.get_dict())
print("-----------")
print("Going to profile page...")
s = session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(s.text)
# handle HTTP authentication:
import requests
from requests.auth import AuthBase
from requests.auth import HTTPBasicAuth
auth = HTTPBasicAuth('ryan', 'password')
r = requests.post(url="http://pythonscraping.com/pages/auth/login.php", auth=auth)
print(r.text)

Scraping JavaScript

Executing JavaScript in Python with Selenium

from selenium import webdriver
import time
driver = webdriver.PhantomJS(executable_path='')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
time.sleep(3)
print(driver.find_element_by_id("content").text)
driver.close()
# second 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.PhantomJS(executable_path='')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
print(driver.find_element_by_id("content").text)
driver.close()

Handling Redirects

from selenium import webdriver
import time
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException
def waitForLoad(driver):
elem = driver.find_element_by_tag_name("html")
count = 0
while True:
count += 1
if count > 20:
print("Timing out after 10 seconds and returning")
return
time.sleep(.5)
try:
elem == driver.find_element_by_tag_name("html")
except StaleElementReferenceException:
return
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)

Image Processing and Text Recognition

library

#Pillow
from PIL import Image, ImageFilter
kitten = Image.open("kitten.jpg")
blurryKitten = kitten.filter(ImageFilter.GaussianBlur)
blurryKitten.save("kitten_blurred.jpg")
blurryKitten.show()

Processing Well-Formatted Text

from PIL import Image
import subprocess
def cleanFile(filePath, newFilePath):
image = Image.open(filePath)
#Set a threshold value for the image, and save
image = image.point(lambda x: 0 if x<143 else 255)
image.save(newFilePath)
#call tesseract to do OCR on the newly created image
subprocess.call(["tesseract", newFilePath, "output"])
#Open and read the resulting data file
outputFile = open("output.txt", 'r')
print(outputFile.read())
outputFile.close()
cleanFile("text_2.png", "text_2_clean.png")
# Scraping Text from Images on Websites
import time
from urllib.request import urlretrieve
import subprocess
from selenium import webdriver
#Create new Selenium driver
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
#Sometimes, I've found that PhantomJS has problems finding elements on this
#page that Firefox does not. If this is the case when you run this,
#try using a Firefox browser with Selenium by uncommenting this line:
#driver = webdriver.Firefox()
driver.get(
"http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
time.sleep(2)
#Click on the book preview button
driver.find_element_by_id("sitbLogoImg").click()
imageList = set()
#Wait for the page to load
time.sleep(5)
#While the right arrow is available for clicking, turn through pages
while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner")
.get_attribute("style"):
driver.find_element_by_id("sitbReaderRightPageTurner").click()
time.sleep(2)
#Get any new pages that have loaded (multiple pages can load at once,
#but duplicates will not be added to a set)
pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img")
for page in pages:
image = page.get_attribute("src")
imageList.add(image)
driver.quit()
#Start processing the images we've collected URLs for with Tesseract
for image in sorted(imageList):
urlretrieve(image, "page.jpg")
p = subprocess.Popen(["tesseract", "page.jpg", "page"],
stdout=subprocess.PIPE,stderr=subprocess.PIPE)
p.wait()
f = open("page.txt", "r")
print(f.read())

Retrieving CAPTCHAs and Submitting Solutions

from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess
import requests
from PIL import Image
from PIL import ImageOps
def cleanImage(imagePath):
image = Image.open(imagePath)
image = image.point(lambda x: 0 if x<143 else 255)
borderImage = ImageOps.expand(image,border=20,fill='white')
borderImage.save(imagePath)
html = urlopen("http://www.pythonscraping.com/humans-only")
bsObj = BeautifulSoup(html)
#Gather prepopulated form values
imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
formBuildId = bsObj.find("input", {"name":"form_build_id"})["value"]
captchaSid = bsObj.find("input", {"name":"captcha_sid"})["value"]
captchaToken = bsObj.find("input", {"name":"captcha_token"})["value"]
captchaUrl = "http://pythonscraping.com"+imageLocation
urlretrieve(captchaUrl, "captcha.jpg")
cleanImage("captcha.jpg")
p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"], stdout=
subprocess.PIPE,stderr=subprocess.PIPE)
p.wait()
f = open("captcha.txt", "r")
#Clean any whitespace characters
captchaResponse = f.read().replace(" ", "").replace("\n", "")
print("Captcha solution attempt: "+captchaResponse)
if len(captchaResponse) == 5:
params = {"captcha_token":captchaToken, "captcha_sid":captchaSid,
"form_id":"comment_node_page_form", "form_build_id": formBuildId,
"captcha_response":captchaResponse, "name":"Ryan Mitchell",
"subject": "I come to seek the Grail",
"comment_body[und][0][value]":
"...and I am definitely not a bot"}
r = requests.post("http://www.pythonscraping.com/comment/reply/10",
data=params)
responseObj = BeautifulSoup(r.text)
if responseObj.find("div", {"class":"messages"}) is not None:
print(responseObj.find("div", {"class":"messages"}).get_text())
else:
print("There was a problem reading the CAPTCHA correctly!")

PySocks

import socks
import socket
from urllib.request import urlopen
socks.set_default_proxy(socks.SOCKS5, "localhost", 9150)
socket.socket = socks.socksocket
print(urlopen('http://icanhazip.com').read())
from selenium import webdriver
service_args = [ '--proxy=localhost:9150', '--proxy-type=socks5', ]
driver = webdriver.PhantomJS(executable_path='<path to PhantomJS>',
service_args=service_args)
driver.get("http://icanhazip.com")
print(driver.page_source)
driver.close()

Testing Your Website with Scrapers

Testing Wikipedia

from urllib.request import urlopen
from bs4 import BeautifulSoup
import unittest
class TestWikipedia(unittest.TestCase):
bsObj = None
def setUpClass():
global bsObj
url = "http://en.wikipedia.org/wiki/Monty_Python"
bsObj = BeautifulSoup(urlopen(url))
def test_titleText(self):
global bsObj
pageTitle = bsObj.find("h1").get_text()
self.assertEqual("Monty Python", pageTitle);
def test_contentExists(self):
global bsObj
content = bsObj.find("div",{"id":"mw-content-text"})
self.assertIsNotNone(content)
if __name__ == '__main__':
unittest.main()

Interacting with the Site

from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver.get("http://pythonscraping.com/pages/files/form.html")
firstnameField = driver.find_element_by_name("firstname")
lastnameField = driver.find_element_by_name("lastname")
submitButton = driver.find_element_by_id("submit")
### METHOD 1 ###
firstnameField.send_keys("Ryan")
lastnameField.send_keys("Mitchell")
submitButton.click()
################
### METHOD 2 ###
actions = ActionChains(driver).click(firstnameField).send_keys("Ryan")
.click(lastnameField).send_keys("Mitchell")
.send_keys(Keys.RETURN)
actions.perform()
################
print(driver.find_element_by_tag_name("body").text)
driver.close()
# Drag and drop
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver import ActionChains
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver.get('http://pythonscraping.com/pages/javascript/draggableDemo.html')
print(driver.find_element_by_id("message").text)
element = driver.find_element_by_id("draggable")
target = driver.find_element_by_id("div2")
actions = ActionChains(driver)
actions.drag_and_drop(element, target).perform()
print(driver.find_element_by_id("message").text) 
# Unittest or Selenium
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver import ActionChains
import unittest
class TestAddition(unittest.TestCase):
driver = None
def setUp(self):
global driver
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
url = 'http://pythonscraping.com/pages/javascript/draggableDemo.html'
driver.get(url)
def tearDown(self):
print("Tearing down the test")
def test_drag(self):
global driver
element = driver.find_element_by_id("draggable")
target = driver.find_element_by_id("div2")
actions = ActionChains(driver)
actions.drag_and_drop(element, target).perform()
self.assertEqual("You are definitely
not a bot!", driver.find_element_by_id(
"message").text)
if __name__ == '__main__':
unittest.main()
# Taking screenshots
driver = webdriver.PhantomJS()
driver.get('http://www.pythonscraping.com/')
driver.get_screenshot_as_file('tmp/pythonscraping.png')

Handling Cookies

from selenium import webdriver
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver.get("http://pythonscraping.com")
driver.implicitly_wait(1)
print(driver.get_cookies())
# 
from selenium import webdriver
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver.get("http://pythonscraping.com")
driver.implicitly_wait(1)
print(driver.get_cookies())
savedCookies = driver.get_cookies()
driver2 = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver2.get("http://pythonscraping.com")
driver2.delete_all_cookies()
for cookie in savedCookies:
driver2.add_cookie(cookie)
driver2.get("http://pythonscraping.com")
driver.implicitly_wait(1)
print(driver2.get_cookies())
# Hidden Input Field Values
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
driver = webdriver.PhantomJS(executable_path='')
driver.get("http://pythonscraping.com/pages/itsatrap.html")
links = driver.find_elements_by_tag_name("a")
for link in links:
if not link.is_displayed():
print("The link "+link.get_attribute("href")+" is a trap")
fields = driver.find_elements_by_tag_name("input")
for field in fields:
if not field.is_displayed():
print("Do not change value of "+field.get_attribute("name"))