WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / YOU CANNOT LOG IN OR REGISTER ACCOUNTS HERE / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FIND SOMETHING THAT MAY NOT BE SUITABLE FOR EVERYONE, CONTACT THE ADMIN AT ilovescratch@foxmail.com
Skip to content

Commit 598dd43

Browse files
committed
Merge from 'Patty-OFurniture:icrawler_master_patches'
Signed-off-by: Zhiyuan Chen <[email protected]>
1 parent 2a27200 commit 598dd43

File tree

4 files changed

+36
-10
lines changed

4 files changed

+36
-10
lines changed

icrawler/builtin/urllist.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ def worker_exec(self, queue_timeout=2, **kwargs):
1010
if self.signal.get("reach_max_num"):
1111
self.logger.info("downloaded image reached max num, thread %s" " exit", threading.current_thread().name)
1212
break
13+
if self.signal.get("exceed_storage_space"):
14+
self.logger.info(
15+
"downloaded image reached max storage space, thread %s" " exit", threading.current_thread().name
16+
)
17+
break
1318
try:
1419
url = self.in_queue.get(timeout=queue_timeout)
1520
except queue.Empty:

icrawler/crawler.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,11 @@ def set_logger(self, log_level=logging.INFO):
8282
def init_signal(self):
8383
"""Init signal
8484
85-
3 signals are added: ``feeder_exited``, ``parser_exited`` and
86-
``reach_max_num``.
85+
4 signals are added: ``feeder_exited``, ``parser_exited``,
86+
``reach_max_num`` and ``exceed_storage_space``.
8787
"""
8888
self.signal = Signal()
89-
self.signal.set(feeder_exited=False, parser_exited=False, reach_max_num=False)
89+
self.signal.set(feeder_exited=False, parser_exited=False, reach_max_num=False, exceed_storage_space=False)
9090

9191
def set_storage(self, storage):
9292
"""Set storage backend for downloader

icrawler/downloader.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import errno
12
import queue
23
import time
34
from io import BytesIO
@@ -114,7 +115,7 @@ def download(self, task, default_ext, timeout=5, max_retry=3, overwrite=False, *
114115
return
115116
self.fetched_num -= 1
116117

117-
while retry > 0 and not self.signal.get("reach_max_num"):
118+
while retry > 0 and not self.signal.get("reach_max_num") and not self.signal.get("exceed_storage_space"):
118119
try:
119120
response = self.session.get(file_url, timeout=timeout)
120121
except Exception as e:
@@ -136,10 +137,19 @@ def download(self, task, default_ext, timeout=5, max_retry=3, overwrite=False, *
136137
with self.lock:
137138
self.fetched_num += 1
138139
filename = self.get_filename(task, default_ext)
139-
self.logger.info("image #%s\t%s", self.fetched_num, file_url)
140-
self.storage.write(filename, response.content)
141-
task["success"] = True
142-
task["filename"] = filename
140+
self.logger.info("image #%s\t%s %s", self.fetched_num, filename, file_url)
141+
142+
task["success"] = False
143+
try:
144+
task["filename"] = filename # may be zero bytes if OSError happened during write()
145+
self.storage.write(filename, response.content)
146+
task["success"] = True
147+
except OSError as o:
148+
# errno.EINVAL -- name too long
149+
if o.errno == errno.ENOSPC:
150+
self.signal.set(exceed_storage_space=True)
151+
else:
152+
raise
143153
break
144154
finally:
145155
retry -= 1

icrawler/parser.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ def worker_exec(self, queue_timeout=2, req_timeout=5, max_retry=3, **kwargs):
6161
"downloaded image reached max num, thread %s " "is ready to exit", current_thread().name
6262
)
6363
break
64+
if self.signal.get("exceed_storage_space"):
65+
self.logger.info("no more storage space, thread %s " "is ready to exit", current_thread().name)
66+
break
6467
# get the page url
6568
try:
6669
url = self.in_queue.get(timeout=queue_timeout)
@@ -90,8 +93,14 @@ def worker_exec(self, queue_timeout=2, req_timeout=5, max_retry=3, **kwargs):
9093
)
9194
else:
9295
self.logger.info(f"parsing result page {url}")
93-
for task in self.parse(response, **kwargs):
94-
while not self.signal.get("reach_max_num"):
96+
task_list = self.parse(response, **kwargs)
97+
if not task_list:
98+
self.logger.debug("self.parse() returned no tasks")
99+
with open("task_list_error.log", "ab") as f:
100+
f.write(response.content)
101+
102+
for task in task_list:
103+
while not self.signal.get("reach_max_num") and not self.signal.get("exceed_storage_space"):
95104
try:
96105
if isinstance(task, dict):
97106
self.output(task, timeout=1)
@@ -110,6 +119,8 @@ def worker_exec(self, queue_timeout=2, req_timeout=5, max_retry=3, **kwargs):
110119
break
111120
if self.signal.get("reach_max_num"):
112121
break
122+
if self.signal.get("exceed_storage_space"):
123+
break
113124
self.in_queue.task_done()
114125
break
115126
finally:

0 commit comments

Comments
 (0)