
A scraper that uses pagination

Example 1

By formatting the URL, we walk through several pages of the catalogue.
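The idea in miniature: substitute the page number into a URL template with str.format (the template is the same one used in the full script below):

pattern = 'https://yandex.ru/yaca/cat/Entertainment/{}.html'

for i in range(0, 5):
    print(pattern.format(i))

# https://yandex.ru/yaca/cat/Entertainment/0.html
# https://yandex.ru/yaca/cat/Entertainment/1.html
# ... and so on up to 4.html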

import requests
from bs4 import BeautifulSoup
import csv
 
 
 
def get_html(url):
    r = requests.get(url)
    if r.ok:  # r.ok is True for any status code below 400
        return r.text
    print(r.status_code)  # e.g. 403 or 404; the function returns None on failure
 
 
def refine_cy(s):
    # 'ТИЦ: 500000'.split(' ') -> ['ТИЦ:', '500000']; keep only the number
    return s.split(' ')[-1]
 
 
def write_csv(data):
    # newline='' prevents blank rows in the CSV on Windows
    with open('yaca.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow((data['name'],
                         data['url'],
                         data['snippet'],
                         data['cy']))
 
 
def get_page_data(html):
    soup = BeautifulSoup(html, 'lxml')

    # each catalogue entry lives in <li class="yaca-snippet">
    lis = soup.find_all('li', class_='yaca-snippet')

    for li in lis:
        try:
            name = li.find('h2').text
        except AttributeError:  # find() returned None — the tag is missing
            name = ''

        try:
            url = li.find('h2').find('a').get('href')
        except AttributeError:
            url = ''

        try:
            snippet = li.find('div', class_='yaca-snippet__text').text.strip()
        except AttributeError:
            snippet = ''

        try:
            c = li.find('div', class_='yaca-snippet__cy').text.strip()
            cy = refine_cy(c)
        except AttributeError:
            cy = ''

        data = {'name': name,
                'url': url,
                'snippet': snippet,
                'cy': cy}

        write_csv(data)
 
 
 
def main():
    pattern = 'https://yandex.ru/yaca/cat/Entertainment/{}.html'

    # pages 0 … 4 of the catalogue section
    for i in range(0, 5):
        url = pattern.format(i)
        html = get_html(url)
        if html:  # get_html() returns None on a non-OK response
            get_page_data(html)
 
 
if __name__ == '__main__':
    main()
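Note that write_csv opens yaca.csv in append mode, so every run adds rows to whatever is already in the file. A minimal sketch of how a header row could be written once per run (the helper name and column labels are my own, chosen to match the dict keys above):

import csv

def prepare_csv():
    # 'w' truncates any old file; the header matches the fields written by write_csv()
    with open('yaca.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('name', 'url', 'snippet', 'cy'))

Calling prepare_csv() at the start of main() would then give one clean file per run.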

Example 2

Scraping coinmarketcap.com: instead of formatting page numbers into the URL, we follow the site's "Next" pagination link until it disappears.
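The key step in isolation — find the pagination block, grab the link whose text contains "Next", and take its href (toy markup here; the real class names come from the full script below):

import re
from bs4 import BeautifulSoup

html = '<div class="cmc-button-group"><a href="/2">Next ></a></div>'  # toy pagination markup
soup = BeautifulSoup(html, 'lxml')
href = soup.find('div', class_='cmc-button-group').find('a', text=re.compile('Next')).get('href')
print(href)  # -> /2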

import requests
from bs4 import BeautifulSoup
import csv
import re
 
 
def get_html(url):
    r = requests.get(url)
    if r.ok:  # True for status codes below 400
        return r.text
    print(r.status_code)
 
 
def write_csv(data):
    # newline='' prevents blank rows in the CSV on Windows
    with open('cmc.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow((data['name'], data['url'], data['price']))
 

def get_page_data(html):
    soup = BeautifulSoup(html, 'lxml')

    # one coin per table row
    trs = soup.find_all('tr', class_='cmc-table-row')
    for tr in trs:
        tds = tr.find_all('td')

        try:
            name = tds[1].find('a', class_='cmc-link').text.strip()
        except (AttributeError, IndexError):  # missing tag or too few cells
            name = ''

        try:
            url = 'https://coinmarketcap.com' + tds[1].find('a', class_='cmc-link').get('href')
        except (AttributeError, IndexError):
            url = ''

        try:
            price = tds[3].find('a').text.strip()
        except (AttributeError, IndexError):
            price = 'price: null'

        data = {'name': name,
                'url': url,
                'price': price}

        write_csv(data)
 

def main():
    url = 'https://coinmarketcap.com/'

    while True:
        html = get_html(url)  # fetch each page once and reuse it below
        if not html:  # get_html() returns None on an error status
            break
        get_page_data(html)

        # follow the "Next" pagination link; stop when there is none
        soup = BeautifulSoup(html, 'lxml')
        try:
            pattern = 'Next'
            href = soup.find('div', class_='cmc-button-group').find('a', text=re.compile(pattern)).get('href')
            url = 'https://coinmarketcap.com' + href  # href starts with '/', so no trailing slash here
        except AttributeError:
            break
 
if __name__ == '__main__':
    main()
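The loop above fires requests back-to-back, which real sites tend to dislike. A more polite variant of get_html — a sketch, assuming only requests plus the standard library; the delay value and User-Agent string are arbitrary placeholders, not anything coinmarketcap.com requires:

import time

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (compatible; study-scraper)'}  # placeholder UA string


def get_html(url, delay=1.0):
    time.sleep(delay)  # pause between requests so we do not hammer the server
    r = requests.get(url, headers=HEADERS)
    if r.ok:
        return r.text
    print(r.status_code)

Dropping this in for the original get_html changes nothing else in the script.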