#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
import csv
from urllib.parse import parse_qs, urlsplit

import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, a timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of handing their HTML to the
    # parsing functions as if it were a results page.
    response.raise_for_status()
    return response.text
def get_total_pages(html):
    """Return the page count advertised by the pagination block of *html*.

    The count is read from the ``p`` query parameter of the last
    ``a.pagination-page`` link inside ``div.pagination-pages``
    (presumably the "last page" link -- verify against the live markup).

    Args:
        html: Markup of a search-results page.

    Returns:
        The page count as an int, or 1 when no pagination links are found.

    Raises:
        ValueError: If the last pagination link has no integer ``p``
            query parameter.
    """
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find('div', class_='pagination-pages')
    if container is None:
        return 1
    links = container.find_all('a', class_='pagination-page')
    if not links:
        return 1

    href = links[-1].get('href') or ''
    # Parse the query string properly so the result does not depend on
    # ``p`` being the first parameter in the link.
    page_values = parse_qs(urlsplit(href).query).get('p')
    if not page_values:
        raise ValueError(
            'no page number in pagination link: {!r}'.format(href))
    return int(page_values[0])
def write_csv(data, filename='avito.csv'):
    """Append one listing as a row to a CSV file.

    Args:
        data: Mapping with the keys ``'title'``, ``'price'``, ``'address'``,
            ``'time'`` and ``'url'``; the values are written in that order.
        filename: Path of the CSV file to append to. It is created if it
            does not exist.

    Raises:
        KeyError: If *data* lacks one of the expected keys.
    """
    row = (
        data['title'],
        data['price'],
        data['address'],
        data['time'],
        data['url'],
    )
    # newline='' lets the csv module control line endings (otherwise extra
    # blank lines appear on Windows); the explicit UTF-8 encoding keeps
    # non-ASCII text writable regardless of the system locale.
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(row)
def _find_in(parent, *args, **kwargs):
    """Call ``parent.find(...)``, returning None when *parent* is None."""
    if parent is None:
        return None
    return parent.find(*args, **kwargs)


def _text_of(node):
    """Return the stripped text of *node*, or '' when *node* is None."""
    if node is None:
        return ''
    return node.text.strip()


def get_page_data(html):
    """Extract every listing from a results page and append it to the CSV.

    For each ``div.item_table`` inside ``div.catalog-list`` the title, price,
    address, date and absolute URL are collected and passed to ``write_csv``.
    A field whose element is missing from the markup is written as an empty
    string. If the page has no ``catalog-list`` container nothing is written.

    Args:
        html: Markup of a search-results page.
    """
    soup = BeautifulSoup(html, 'lxml')
    catalog = soup.find('div', class_='catalog-list')
    if catalog is None:
        return

    for ad in catalog.find_all('div', class_='item_table'):
        heading = _find_in(ad.find('div', class_='description'), 'h3')
        link = _find_in(heading, 'a')
        href = link.get('href') if link is not None else None
        date = _find_in(ad.find('div', class_='data'), 'div', class_='date')

        data = {
            'title': _text_of(heading),
            'price': _text_of(ad.find('div', class_='about')),
            'address': _text_of(ad.find('p', class_='address')),
            'time': _text_of(date),
            # Links in the markup are site-relative, so prefix the host.
            'url': 'https://www.avito.ru' + href if href is not None else '',
        }
        write_csv(data)
def main():
    """Scrape all result pages of the hard-coded Avito search into the CSV.

    The first page is downloaded once and used both to determine the total
    number of pages and as the source of the first batch of listings; the
    remaining pages are then fetched and processed in order.
    """
    # Single source of truth for the search URL; only the page number varies.
    url_template = (
        'https://www.avito.ru/novosibirsk/kvartiry/sdam/'
        'na_dlitelnyy_srok/1-komnatnye?p={page}&i=1&pmax=14000&user=1'
    )

    first_page_html = get_html(url_template.format(page=1))
    total_pages = get_total_pages(first_page_html)

    # Page 1 is already in hand, so parse it directly instead of
    # requesting it a second time.
    get_page_data(first_page_html)
    for page in range(2, total_pages + 1):
        get_page_data(get_html(url_template.format(page=page)))
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()