Beautiful Soup - Unable To Create Csv And Text Files After Scraping
Solution 1:
A solution using lxml's HTML parser.
There are 361 pages, and each page has 12 links. We can iterate over every page and extract the links using XPath.
XPath helps in getting (see the short sketch below):
- the text under a particular tag
- the value of a particular attribute (here: the value of the 'href' attribute of an 'a' tag)
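For illustration, a minimal sketch of both patterns; the HTML snippet is made up purely for the example, while the class name mirrors the one used on the real page:

from lxml import html

# Made-up snippet, only to illustrate the two XPath patterns
snippet = '<div class="archive-constraint"><h2 class="entry-title"><a href="https://example.com/post">Post title</a></h2></div>'
tree = html.fromstring(snippet)

titles = tree.xpath('//h2[@class = "entry-title"]/a/text()')  # text under a particular tag
links = tree.xpath('//h2[@class = "entry-title"]/a/@href')    # value of the 'href' attribute

print(titles)  # ['Post title']
print(links)   # ['https://example.com/post']

The full script for all 361 pages: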
import csv
from time import sleep
from random import randint

import requests
from lxml import html

outputFile = open("All_links.csv", 'w', newline='')
fileWriter = csv.writer(outputFile)
fileWriter.writerow(["Sl. No.", "Page Number", "Link"])

url1 = 'https://www.marketingweek.com/page/'
url2 = '/?s=big+data'

sl_no = 1
# Iterating from the 1st page through the 361st page
for i in range(1, 362):
    # Generating the final URL to be scraped from the page number
    url = url1 + str(i) + url2
    # Fetching the page
    response = requests.get(url)
    sleep(randint(10, 20))
    # Using the html parser
    htmlContent = html.fromstring(response.content)
    # Capturing all 'a' tags under h2 tags with class 'hentry-title entry-title'
    page_links = htmlContent.xpath('//div[@class = "archive-constraint"]//h2[@class = "hentry-title entry-title"]/a/@href')
    for page_link in page_links:
        fileWriter.writerow([sl_no, i, page_link])
        sl_no += 1

outputFile.close()
Solution 2:
I only fixed the URL collection part:
import requests
from bs4 import BeautifulSoup
next_button = 'https://www.marketingweek.com/page/1/?s=big+data'
res = []
while True:
    response = requests.get(next_button)
    soup = BeautifulSoup(response.text, "lxml")
    # Localizing the search window that contains the article links
    search_results = soup.find('div', class_='archive-constraint')
    # The ordinary scheme goes further; duplicates are dropped from the list of URLs
    article_link_tags = search_results.findAll('a')
    row = [url['href'] for url in article_link_tags]
    row = list(set(row))
    res.append(row)
    # Automatically "clicks" the Next button to load further articles
    next_button = soup.find('a', class_='next page-numbers')
    # Keeps searching for articles until the Next button is no longer found
    if not next_button:
        break
    next_button = next_button['href']

for i in res:
    for j in i:
        print(j)
EDIT: The first version of the code ran into the site's bot blocking. The fixes below might help (for 100% assurance you would need to know how the site's bot blocker works). I added both a random sleep of 5 to 30 seconds and a random user-agent (desktop user-agents only) to the requests in order to skip the blocking:
import numpy as np
import time
import requests
from bs4 import BeautifulSoup
user_agents = {
    0: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
    1: 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
    2: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
    3: 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
    4: 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'
}
next_button = 'https://www.marketingweek.com/page/1/?s=big+data'
res = []
while True:
    # Random desktop user-agent on every request
    response = requests.get(next_button, headers={'User-Agent': user_agents[np.random.randint(0, 5)]})
    soup = BeautifulSoup(response.text, "lxml")
    # Localizing the search window that contains the article links
    search_results = soup.find('div', class_='columns-flex full-ads')
    # The ordinary scheme goes further; duplicates are dropped from the list of URLs
    article_link_tags = search_results.findAll('a')
    row = [url['href'] for url in article_link_tags]
    row = list(set(row))
    res.append(row)
    # Automatically "clicks" the Next button to load further articles
    next_button = soup.find('a', class_='next page-numbers')
    # Keeps searching for articles until the Next button is no longer found
    if not next_button:
        break
    next_button = next_button['href']
    # Random sleep between 5 and 30 seconds
    time.sleep((30 - 5) * np.random.random() + 5)

for i in res:
    for j in i:
        print(j)
If it still fails after some time, try doubling the sleep interval or making it even longer.
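Since the original problem was about creating CSV and text files, here is a minimal sketch of how the collected res list could be written to a CSV file once the loop above has finished; the file name All_links.csv is just an example:

import csv

# Flatten the per-page lists of links collected in res
all_links = [link for page_links in res for link in page_links]

# Write one link per row; the file name is only an example
with open('All_links.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Link'])
    for link in all_links:
        writer.writerow([link])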
Solution 3:
I tried to print next_button and it is None. Why do we need the a tag of the Next button at all if we can go directly to a specific page by slightly modifying the URL? In your code, even supposing you had found the link to the next page, you still didn't get the text of the element returned by soup.find('a', text=">>"). Here is your modified code. Let me know if it works for you.
import requests
from bs4 import BeautifulSoup
import pandas as pd

res = []
page = 1
while page < 362:
    page_url = 'https://www.marketingweek.com/page/' + str(page) + '/?s=big+data'
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")
    # Localizing the search window that contains the article links
    search_results = soup.find('div', class_='columns-flex full-ads')
    try:
        article_link_tags = search_results.findAll('a')
    except AttributeError:
        # No link container on this page; move on to the next one
        page += 1
        continue
    res.append([url['href'] for url in article_link_tags])
    print('Found {} links on page {}'.format(len(res[-1]), page))
    page += 1

### Storing the scraped links in a csv file ###
df = pd.DataFrame([link for page_links in res for link in page_links], columns=['Link'])
df.to_csv('SampleUrl.csv', index=False)
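And since the question title also mentions text files, a minimal sketch that writes the same links to a plain text file, one URL per line; the file name SampleUrl.txt is only an example:

# Write one URL per line to a plain text file
with open('SampleUrl.txt', 'w') as f:
    for page_links in res:
        for link in page_links:
            f.write(link + '\n')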