I want to scrape a website asynchronously using a list of tor circuits with different exit nodes and making sure each exit node only makes a request every 5 seconds.
For testing purposes, I’m using the website https://books.toscrape.com/ and I’m lowering the sleep time, number of circuits and number of pages to scrape.
It works fine without tor, but I’m getting the following error when I use tor.:
2022-09-06 11:08:49,380 [DEBUG] Loaded 10 authorities dir
2022-09-06 11:08:49,383 [DEBUG] Loaded 141 fallbacks dir
2022-09-06 11:08:49,383 [DEBUG] Using selector: EpollSelector
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
{}
import asyncio
import aiohttp
import logging
from docopt import docopt
from torpy import TorClient
from typing import Dict, List
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler("debug.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
def main():
"""
Usage:
scraper.py <url>... [--tor]
scraper.py -h | --help
Options:
-h --help Show this screen.
--tor Use tor to scrape website
"""
args = docopt(main.__doc__)
urls = args['<url>']
tor = args['--tor']
scrape_website(urls, tor)
def scrape_test_website() -> None:
TEST_URL = "https://books.toscrape.com/catalogue/"
urls = [f"{TEST_URL}page-{str(i)}.html" for i in range(1, 5)]
print(scrape_website(urls, tor=True))
def scrape_website(urls: List[str], tor: bool = False) -> Dict:
if tor:
scraper = TorWebScraper(urls)
else:
scraper = WebScraper(urls)
asyncio.run(scraper.run())
return scraper.master_dict
class WebScraper(object):
def __init__(self, urls: List[str]):
self.urls = urls
self.all_data = []
self.master_dict = {}
async def fetch(self, url: str) -> str:
try:
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
text = await response.text()
return url, text
except Exception as e:
logger.error(e)
async def run(self) -> None:
tasks = []
for url in self.urls:
tasks.append(self.fetch(url))
self.all_data = await asyncio.gather(*tasks)
for data in self.all_data:
if data is not None:
url = data[0]
self.master_dict[url] = {'raw_html': data[1]}
def get_circuits(n: int = 2) -> List:
"""
Get a list of one-hop tor circuits with different nodes
"""
circuits = []
with TorClient() as tor:
for _ in range(n):
circuits.append(tor.create_circuit())
return circuits
class TorWebScraper(WebScraper):
def __init__(self, urls: List[str]):
super().__init__(urls)
self.circuits = get_circuits(2)
async def fetch(self, url: str) -> str:
try:
async with aiohttp.ClientSession() as session:
for circuit in self.circuits:
async with circuit.create_stream() as stream:
async with session.get(url, proxy=stream.proxy) as response:
await asyncio.sleep(20e-3)
text = await response.text()
return url, text
except Exception as e:
logger.error(e)
if __name__ == '__main__':
#main()
scrape_test_website()
You must log in or register to comment.