import pandas as pd
import numpy as np
from typing import List, Union
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+import requests
+import os
+import time
+import io
+from PIL import Image

def balance_binary_target(df, strategy='smote', minority_ratio=None, visualize=False):
    """
@@ -193,3 +201,97 @@ def load_model_zip(zip_file, model_file): |
        model = pickle.load(file)

    return model
+
+def image_scrap(url, n: int):
+    '''
+    Scrape Google Images and download the first n results into a new
+    folder called 'my_images'.
+
+    Since this relies on Selenium, a Chrome driver is required: download
+    'chromedriver' from https://chromedriver.chromium.org/ and place it in
+    the same directory as this script or notebook.
+
+    Parameters
+    ----------
+    url : str
+        Full URL of a Google Images results page.
+    n : int
+        Number of images to download into the folder.
+
+    Returns
+    -------
+    None. Creates a folder called 'my_images' containing n images that
+    can be reused as often as you want.
+    '''
+    # Locate the bundled chromedriver next to the script (Windows binary
+    # name; drop the '.exe' suffix on macOS/Linux).
+    current_dir = os.getcwd()
+    driver_path = os.path.join(current_dir, "chromedriver.exe")
+
+    # Selenium 4 expects the driver path wrapped in a Service object.
+    # (With Selenium >= 4.6, webdriver.Chrome() alone also works: Selenium
+    # Manager downloads a matching driver automatically.)
+    wd = webdriver.Chrome(service=Service(driver_path))
+
+    def get_images_from_google(url, wd, delay, max_images):
+        def scroll_down(wd):
+            # Scroll to the bottom so Google lazy-loads more thumbnails.
+            wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(delay)
+
+        wd.get(url)
+
+        # Dismiss the consent/"show more" button if Google displays one;
+        # this XPath tracks Google's markup and may need updating over time.
+        try:
+            wd.find_element(By.XPATH, '/html/body/c-wiz/div/div/div/div[2]/div/div[3]/div/div/form/div/div/button').click()
+        except Exception:
+            pass
+
+        image_urls = set()
+        skips = 0
+
+        while len(image_urls) + skips < max_images:
+            scroll_down(wd)
+
+            # Thumbnail grid items (Google's class names change over time).
+            thumbnails = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
+
+            for img in thumbnails[len(image_urls) + skips:max_images]:
+                try:
+                    # Click the thumbnail so the full-size image loads.
+                    img.click()
+                    time.sleep(delay)
+                except Exception:
+                    continue
+
+                # Full-size image element(s) opened by the click.
+                images = wd.find_elements(By.CLASS_NAME, "n3VNCb")
+                for image in images:
+                    src = image.get_attribute('src')
+                    if src in image_urls:
+                        # Already collected: widen the target and skip it.
+                        max_images += 1
+                        skips += 1
+                        break
+
+                    if src and 'http' in src:
+                        image_urls.add(src)
+                        print(f"Found {len(image_urls)}")
+
+        return image_urls
+
+    def download_image(download_path, url, file_name):
+        try:
+            image_content = requests.get(url).content
+            image_file = io.BytesIO(image_content)
+            # Convert to RGB so PNG/RGBA sources can still be saved as JPEG.
+            image = Image.open(image_file).convert("RGB")
+            # os.path.join avoids gluing the folder and file names together.
+            file_path = os.path.join(download_path, file_name)
+
+            with open(file_path, "wb") as f:
+                image.save(f, "JPEG")
+
+            print("Success")
+        except Exception as e:
+            print('FAILED -', e)
+
+    # Collect n image URLs with a 1-second delay between interactions.
+    urls = get_images_from_google(url, wd, 1, n)
+
+    # Reuse the working directory resolved above (os.path.dirname(__file__)
+    # would fail inside a notebook, which the docstring explicitly supports).
+    download_dir = os.path.join(current_dir, "my_images")
+
+    if not os.path.exists(download_dir):
+        os.makedirs(download_dir)
+
+    for i, img_url in enumerate(urls):
+        download_image(download_dir, img_url, str(i) + ".jpg")
+
+    wd.quit()
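+
+# Minimal usage sketch (an assumed example: the query URL below is a
+# placeholder, and chromedriver is expected next to this script):
+#
+#     image_scrap("https://www.google.com/search?q=cats&tbm=isch", 10)
+#
+# This opens Chrome, collects 10 image URLs from the results page, and
+# saves them as my_images/0.jpg ... my_images/9.jpg.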