본문 바로가기

IT 개발 프로그래밍/파이썬

파이썬으로 여러 페이지를 연속해서 크롤링하기

 

import requests
from bs4 import BeautifulSoup

class Conversation:
    """A single question/answer pair.

    The two values are stored exactly as given. They are normally text, but
    any object is accepted; ``__str__`` converts them with ``str()`` when
    rendering.
    """

    def __init__(self, question, answer):
        """Store the question and its answer without modification."""
        self.question = question
        self.answer = answer

    def __str__(self):
        """Return the pair as two labelled lines, each ending in a newline."""
        # Formatting (rather than "+" concatenation) avoids a TypeError when
        # either field is not a str, e.g. None.
        return f"질문: {self.question}\n답변: {self.answer}\n"

    def __repr__(self):
        """Return an unambiguous representation for debugging."""
        return (
            f"{type(self).__name__}"
            f"(question={self.question!r}, answer={self.answer!r})"
        )



def get_subjects():
    """Return the topic names listed on the conversation-topics index page.

    Downloads the index page and collects the text of every ``<a>`` element
    found inside the ``div`` elements whose class attribute is
    ``"su-column-inner su-clearfix"``.

    Returns:
        list[str]: The link texts, in document order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    response = requests.get(
        'https://basicenglishspeaking.com/daily-english-conversation-topics/',
        # Without a timeout a stalled connection would block forever.
        timeout=10,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # empty (and misleading) topic list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    columns = soup.find_all('div', {"class": "su-column-inner su-clearfix"})
    return [link.text for column in columns for link in column.find_all('a')]


# Fetch the topic list once, then report how many topics were found
# followed by the full list.
subjects = get_subjects()
subject_count = len(subjects)
print('총 ', subject_count, '개의 주제를 찾았습니다.')
print(subjects)

 

 

총 75 개의 주제를 찾았습니다.
['Family', 'Restaurant', 'Books', 'Travel', 'Website', 'Accident', 'Childhood memory', 'Favorite rooms', 'Presents', 'Historical place', 'Newspaper/ Magazine', 'A memorable event', 'A favorite subject', 'A museum', 'A favorite movie', 'A foreign country', 'Parties', 'A teacher', 'A friend', 'A hotel', 'A letter', 'Hobbies', 'Music', 'Shopping', 'Holiday', 'Animals', 'A practical skill', 'Sport', 'A School', 'Festival', 'Food', 'Household appliance', 'A music band', 'Weather', 'Neighbor', 'Natural scenery', 'Outdoor activities', 'Law', 'Pollution', 'Traffic jam', 'TV program', 'Architect/ Building', 'Electronic Media', 'Job/ Career', 'Competition/ contest', 'A garden', 'Hometown', 'Clothing', 'Advertisement', 'A project', 'A wedding', 'A Coffee shop', 'Culture', 'Transport', 'Politician', 'Communication', 'Business', 'Computer', 'Exercise', 'Goal/ ambition', 'Art', 'Fashion', 'Jewelry', 'Cosmetic', 'Indoor Game', 'Phone conversation', 'Learning A Second language', 'A\xa0Creative Person', 'A celebrity', 'A Health Problem', 'Technological advancements', 'A Landmark', 'Handcraft Items', 'Plastic Surgery', 'Success']

 

 

 

 

 

conversations = []

# Only the first few topics are crawled; the remaining ones are skipped.
MAX_SUBJECTS = 4

# Visit each conversation topic in turn.
for i, sub in enumerate(subjects[:MAX_SUBJECTS], start=1):
    print('(', i, '/', len(subjects), ')', sub)
    # Request the page that shows the conversation script.
    # NOTE(review): the URL is built from the topic's link text, not from
    # the link's href; topics containing spaces or "/" may not resolve to a
    # real page -- confirm against the site.
    # The timeout keeps a stalled connection from blocking the crawl forever.
    req = requests.get('https://basicenglishspeaking.com/' + sub, timeout=10)
    html = req.text
    soup = BeautifulSoup(html, 'html.parser')

    qnas = soup.find_all('div', {"class": "sc_player_container1"})

    # The matched elements alternate question, answer, question, answer, ...
    # and the node right after each one is taken as its text. Pairing by
    # position (instead of qnas.index(qna)) is linear and is not confused by
    # elements that compare equal to an earlier one. A trailing unpaired
    # question is dropped.
    for question_node, answer_node in zip(qnas[0::2], qnas[1::2]):
        q = question_node.next_sibling
        a = answer_node.next_sibling
        conversations.append(Conversation(q, a))

print('총 ', len(conversations), '개의 대화를 찾았습니다.')

for c in conversations:
    print(str(c))

불러오는 중입니다...