Flipkart-WebScreping/main.py at main · ashudevcodes/Flipkart-WebScreping · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import requests
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
import sys

# Command-line values with the script name stripped off.
args = sys.argv[1:]

# CLI mode needs all five values (search term plus four class names).
# Checking for the full count avoids an IndexError when only some of them
# are supplied; any extra arguments beyond the fifth are ignored.
if len(args) >= 5:
    (
        searchFor,
        titleClassName,
        priceClassName,
        reviewClassName,
        prodctLinkClassName,
    ) = args[:5]
    print(titleClassName, priceClassName, reviewClassName, prodctLinkClassName)
else:
    if len(args) > 1:
        # A partial argument list cannot be used; tell the user why the
        # script is prompting instead of silently discarding their input.
        print(
            f"Expected 5 arguments (search term and 4 class names) but got "
            f"{len(args)}; falling back to interactive prompts.",
            file=sys.stderr,
        )

    # Interactive mode: ask for every value in turn.
    searchFor = input("tell me what are you searching for ? ")

    titleClassName = input(
        '''
    Find the class name of title and past here
    Title ClassName: '''
    )

    priceClassName = input(
        '''
    Find the class name of price and past here
    Price ClassName: '''
    )

    reviewClassName = input(
        '''
    Find span tag with contains rating class name and past here
    Rating ClassName: '''
    )

    prodctLinkClassName = input(
        '''
    Find a tag with contains link of products and past here
    Products ClassName: '''
    )


# fileHandle = open("./temp.html")


def scrapeTitleData(htmlData, className):
    """Collect product titles into the module-level ``names`` list.

    Every element carrying the given class is matched, whatever its tag,
    and its text content is appended to ``names``. When nothing matches,
    a notice is printed and ``names`` is left unchanged.

    Args:
         htmlData: HTML markup, parsed with BeautifulSoup's ``lxml`` parser.
         className: class attribute value that identifies title elements.

    """
    titleNodes = BeautifulSoup(htmlData, "lxml").find_all(class_=className)

    # Guard clause: nothing matched, so there is nothing to record.
    if not titleNodes:
        print("No Title data avelable")
        return

    names.extend(node.get_text() for node in titleNodes)


def scrapePriceData(htmlData, className):
    """Collect product prices into the module-level ``prices`` list.

    Only ``<div>`` elements with the given class are matched; the text of
    each one is appended to ``prices``. When nothing matches, a notice is
    printed and ``prices`` is left unchanged.

    Args:
         htmlData: HTML markup, parsed with BeautifulSoup's ``lxml`` parser.
         className: class attribute value that identifies price ``<div>``s.

    """
    priceDivs = BeautifulSoup(htmlData, "lxml").find_all(
        "div", class_=className
    )

    # Guard clause: nothing matched, so there is nothing to record.
    if not priceDivs:
        print("No Price data avelable")
        return

    prices.extend(div.get_text() for div in priceDivs)


def scrapeReviewsData(htmlData, className):
    """Collect review text into the module-level ``reviews`` list.

    Each ``<span>`` with the given class is expected to contain a nested
    ``<span>`` which itself contains another ``<span>``; the stripped text
    of that innermost span is appended to ``reviews``. When nothing
    matches the class, a notice is printed and ``reviews`` is left
    unchanged.

    Args:
         htmlData: HTML markup, parsed with BeautifulSoup's ``lxml`` parser.
         className: class attribute value that identifies the outer
             review ``<span>``.

    """
    soup = BeautifulSoup(htmlData, "lxml")
    products = soup.find_all("span", class_=className)

    if not products:
        print("No Reviews data avelable")
        return

    for spanTag in products:
        # find() returns None when there is no matching descendant, so each
        # level is checked before descending instead of chaining the calls
        # (which raised AttributeError on a span without the nested layout).
        outerSpan = spanTag.find("span")
        innerSpan = outerSpan.find("span") if outerSpan is not None else None

        # Keep one entry per matched span; an empty string marks a span
        # whose nested review text is missing.
        if innerSpan is not None:
            reviews.append(innerSpan.get_text(strip=True))
        else:
            reviews.append("")


def scrapeProductlinks(htmlData, className):
    """Collect product URLs into the module-level ``links`` list.

    Only ``<a>`` elements that have the given class and an ``href``
    attribute are matched. Each ``href`` is prefixed with the Flipkart
    origin and appended to ``links``. When nothing matches, a notice is
    printed and ``links`` is left unchanged.

    Args:
         htmlData: HTML markup, parsed with BeautifulSoup's ``lxml`` parser.
         className: class attribute value that identifies product ``<a>``
             tags.

    """
    anchors = BeautifulSoup(htmlData, "lxml").find_all(
        "a", class_=className, href=True
    )

    # Guard clause: nothing matched, so there is nothing to record.
    if not anchors:
        print("No Link data avelable")
        return

    links.extend(
        f"https://www.flipkart.com{anchor['href']}" for anchor in anchors
    )


# Module-level accumulators filled in place by the scrape* functions.
reviews = []
names = []
prices = []
links = []

# scrapeTitleData(fileHandle, titleClassName)
# scrapePriceData(fileHandle, priceClassName)
# scrapeReviewsData(fileHandle, reviewClassName)
# scrapeProductlinks(fileHandle, prodctLinkClassName)


# The request headers do not change between pages, so build them once.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0"
}
url = "https://www.flipkart.com/search"

# range(1, 2) yields only page 1; raise the upper bound to fetch more pages.
for pageNumber in range(1, 2):
    # Passing the query through ``params`` lets requests URL-encode the
    # search term, so spaces, '&', '#', etc. cannot corrupt the query string.
    # The timeout stops an unresponsive server from hanging the script.
    res = requests.get(
        url,
        params={"q": searchFor, "page": pageNumber},
        headers=headers,
        timeout=30,
    )
    scrapeTitleData(res.text, titleClassName)
    scrapePriceData(res.text, priceClassName)
    scrapeReviewsData(res.text, reviewClassName)
    scrapeProductlinks(res.text, prodctLinkClassName)


# Assemble the scraped lists into one table. The "reviews" column is left
# out when no reviews were collected. zip() stops at the shortest list, so
# the table has as many rows as the shortest included column.
columnData = {
    'name': names,
    'price': prices,
    'reviews': reviews,
    'link': links,
}
if not reviews:
    del columnData['reviews']

df = pd.DataFrame(
    list(zip(*columnData.values())),
    columns=list(columnData),
)

# Show the table on stdout, then save it to a CSV file.
print(df)

df.to_csv("allProducts.csv")