I’m on Linux and I’ve installed Redis and have it running in the background, which aiohttp_scraper requires. The library’s documentation didn’t say I had to install Redis myself, so maybe I’m missing some other step I need to take before I can use it.
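To rule out a connection problem on my side, I first check that Redis is actually reachable (a minimal sketch assuming aioredis 1.x, which is what the traceback below points at, and the same default URI used in get_proxies()):

import asyncio
import aioredis  # aioredis 1.x, matching the paths in the traceback below

async def check_redis():
    # Same URI that get_proxies() passes to Proxies(); adjust if Redis runs elsewhere.
    redis = await aioredis.create_redis("redis://localhost:6379")
    print(await redis.ping())  # b'PONG' when the server is reachable
    redis.close()
    await redis.wait_closed()

asyncio.run(check_redis())

This only proves the server answers a PING; it doesn’t exercise aiohttp_scraper’s own connection handling.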
The following code works fine:
import asyncio

from aiohttp_scraper import Proxies
from aiohttp_scraper import ScraperSession as ClientSession
from urllib.request import urlopen


def scrape():
    TEST_URL = "https://books.toscrape.com/catalogue/"
    urls = [f"{TEST_URL}page-{str(i)}.html" for i in range(1, 5)]
    scraper = WebScraper(urls)
    asyncio.run(scraper.run())
    print(scraper.master_dict)


def get_proxies() -> Proxies:
    PROXY_URL = "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt"
    proxies = urlopen(PROXY_URL).read().decode('utf-8').splitlines()
    return Proxies(
        proxies=proxies,
        redis_uri="redis://localhost:6379",
        window_size_in_minutes=5,
        max_requests_per_window=300,
    )


class WebScraper(object):
    def __init__(self, urls):
        self.urls = urls
        self.proxies = get_proxies()
        self.master_dict = {}

    async def run(self):
        loop = asyncio.get_event_loop()
        async with ClientSession(loop=loop) as session:
            tasks = [loop.create_task(self.fetch(session, url)) for url in self.urls]
            await asyncio.gather(*tasks)

    async def fetch(self, session, url):
        async with session.get(url) as response:
            print(response.status)
            self.master_dict[url] = await response.text()


if __name__ == "__main__":
    scrape()
But if I change the line in run() that opens the ClientSession to
async with ClientSession(loop=loop, proxies=self.proxies) as session:
then the code hangs every time I execute it. The only thing I see in the output is:
❯ python test.py
Task was destroyed but it is pending!
task: <Task pending name='Task-6' coro=<RedisConnection._read_data() running at /home/user/.local/lib/python3.10/site-packages/aioredis/connection.py:186> wait_for=<Future pending cb=[Task.task_wakeup()]> cb=[RedisConnection.__init__.<locals>.<lambda>() at /home/user/.local/lib/python3.10/site-packages/aioredis/connection.py:168]>
Task was destroyed but it is pending!
task: <Task pending name='Task-7' coro=<RedisConnection._read_data() running at /home/user/.local/lib/python3.10/site-packages/aioredis/connection.py:186> wait_for=<Future pending cb=[Task.task_wakeup()]> cb=[RedisConnection.__init__.<locals>.<lambda>() at /home/user/.local/lib/python3.10/site-packages/aioredis/connection.py:168]>
Task was destroyed but it is pending!
task: <Task pending name='Task-8' coro=<RedisConnection._read_data() running at /home/user/.local/lib/python3.10/site-packages/aioredis/connection.py:186> wait_for=<Future pending cb=[Task.task_wakeup()]> cb=[RedisConnection.__init__.<locals>.<lambda>() at /home/user/.local/lib/python3.10/site-packages/aioredis/connection.py:168]>
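In case more detail is useful, this is a minimal sketch of how the same run can be wrapped with standard-library diagnostics (asyncio debug mode, so the destroyed-task warnings carry creation tracebacks, plus a timeout so the script doesn’t sit forever; the 60-second value is arbitrary):

import asyncio

TEST_URL = "https://books.toscrape.com/catalogue/"
urls = [f"{TEST_URL}page-{str(i)}.html" for i in range(1, 5)]
scraper = WebScraper(urls)  # WebScraper as defined in the script above

async def run_with_timeout():
    # Give up after 60 seconds instead of hanging indefinitely.
    await asyncio.wait_for(scraper.run(), timeout=60)

# debug=True makes asyncio attach source tracebacks to pending-task
# warnings and log callbacks that block the event loop for too long.
asyncio.run(run_with_timeout(), debug=True)

wait_for cancels scraper.run() when the timeout fires, so the script exits on its own instead of needing a manual Ctrl-C.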