[ 기타 활동 ]/파이썬

스크롤러

유니시티황 2018. 5. 15. 03:48

from urllib.request import urlretrieve

import urllib.parse

from urllib.parse import urlencode, urlparse, parse_qs

import webbrowser

from bs4 import BeautifulSoup

import requests


# Base Google search URL.
# NOTE(review): this URL already ends in a complete q= value, so the encoded
# text appended below lands directly after "pdf" with no separator - confirm
# that this is the intended query shape.
address = 'https://www.google.co.kr/search?num=10&ie=UTF-8&q=site:pdf.th7.cn/down/files+arduino+pdf'

# Read the text document that contains the question. The context manager
# closes the file even if read() raises.
with open("OCR.txt", "rt") as file:
    word = file.read()

# The question is on multiple lines; join them with single spaces.
myList = word.split('\n')
newString = ' '.join(myList)

# URL-encode the question (quote_plus turns spaces into '+').
qstr = urllib.parse.quote_plus(newString)

# Combine the base address and the encoded query.
newWord = address + qstr


# Download the search results page and parse it.
# NOTE(review): the request uses `address`, not `newWord`, so the query built
# from OCR.txt is never sent - confirm which URL is intended.
source = requests.get(address)
html = source.text
soup = BeautifulSoup(html, 'lxml')

# Prefix this script expects on result hrefs ("/url?q=<target>&...").
redirect_prefix = '/url?q='

# Print the target URL of every result heading (<h3 class="r">).
for tag in soup.select('h3[class=r]'):
    anchor = tag.a
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # Heading without a usable link: skip it instead of raising
        # TypeError/KeyError.
        continue

    target = href
    # Remove the prefix as a whole string. str.lstrip() treats its argument
    # as a set of characters and would also eat leading characters of the
    # real URL (e.g. "/url?q=quora.com" -> "ora.com").
    if target.startswith(redirect_prefix):
        target = target[len(redirect_prefix):]

    # Keep only the part before the first '&' (drops trailing parameters).
    target = target.split('&')[0]
    # Turn double-encoded spaces back into literal spaces.
    target = target.replace("%2520", " ")
    print(target)