#!/usr/bin/env python

# Copyright (c) 2009-2020, Mario Vilas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the copyright holder nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import os
import random
import sys
import time
import ssl

if sys.version_info[0] > 2:
    from http.cookiejar import LWPCookieJar
    from urllib.request import Request, urlopen
    from urllib.parse import quote_plus, urlparse, parse_qs
else:
    from cookielib import LWPCookieJar
    from urllib import quote_plus
    from urllib2 import Request, urlopen
    from urlparse import urlparse, parse_qs

try:
    from bs4 import BeautifulSoup
    is_bs4 = True
except ImportError:
    from BeautifulSoup import BeautifulSoup
    is_bs4 = False

__all__ = [

    # Main search function.
    'search',

    # Shortcut for "get lucky" search.
    'lucky',

    # Miscellaneous utility functions.
    'get_random_user_agent', 'get_tbs',
]

# URL templates to make Google searches.
url_home = "https://www.google.%(tld)s/"
url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
             "btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
             "cr=%(country)s"
url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
                "start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
                "cr=%(country)s"
url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
                 "num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
                 "cr=%(country)s"
url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
                    "q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
                    "safe=%(safe)s&cr=%(country)s"
url_parameters = (
    'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')
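# For illustration only, a doctest-style sketch of how the url_search
# template above expands. The values are hypothetical and the query must
# already be URL-encoded:
#
#   >>> url_search % {'tld': 'com', 'lang': 'en', 'query': 'hello+world',
#   ...               'tbs': '0', 'safe': 'off', 'country': ''}
#   'https://www.google.com/search?hl=en&q=hello+world&btnG=Google+Search&tbs=0&safe=off&cr='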
# Cookie jar. Stored at the user's home folder.
# If the cookie jar is inaccessible, the errors are ignored.
home_folder = os.getenv('HOME')
if not home_folder:
    home_folder = os.getenv('USERHOME')
    if not home_folder:
        home_folder = '.'   # Use the current folder on error.
cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
try:
    cookie_jar.load()
except Exception:
    pass

# Default user agent, unless instructed by the user to change it.
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'

# Load the list of valid user agents from the install folder.
# The search order is:
#   * user_agents.txt.gz
#   * user_agents.txt
#   * default user agent
try:
    install_folder = os.path.abspath(os.path.split(__file__)[0])
    try:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt.gz')
        import gzip
        fp = gzip.open(user_agents_file, 'rb')
        try:
            user_agents_list = [_.strip() for _ in fp.readlines()]
        finally:
            fp.close()
            del fp
    except Exception:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt')
        with open(user_agents_file) as fp:
            user_agents_list = [_.strip() for _ in fp.readlines()]
except Exception:
    user_agents_list = [USER_AGENT]


# Get a random user agent.
def get_random_user_agent():
    """
    Get a random user agent string.

    :rtype: str
    :return: Random user agent string.
    """
    return random.choice(user_agents_list)


# Helper function to format the tbs parameter.
def get_tbs(from_date, to_date):
    """
    Helper function to format the tbs parameter.

    :param datetime.date from_date: Python date object.
    :param datetime.date to_date: Python date object.

    :rtype: str
    :return: Dates encoded in tbs format.
    """
    from_date = from_date.strftime('%m/%d/%Y')
    to_date = to_date.strftime('%m/%d/%Y')
    return 'cdr:1,cd_min:%(from_date)s,cd_max:%(to_date)s' % vars()
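# For illustration, a doctest-style sketch of get_tbs() with hypothetical
# dates. This module does not import datetime; the caller provides the
# date objects:
#
#   >>> from datetime import date
#   >>> get_tbs(date(2019, 1, 1), date(2019, 6, 30))
#   'cdr:1,cd_min:01/01/2019,cd_max:06/30/2019'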
# Request the given URL and return the response page, using the cookie jar.
# If the cookie jar is inaccessible, the errors are ignored.
def get_page(url, user_agent=None, verify_ssl=True):
    """
    Request the given URL and return the response page, using the cookie jar.

    :param str url: URL to retrieve.
    :param str user_agent: User agent for the HTTP requests.
        Use None for the default.
    :param bool verify_ssl: Verify the SSL certificate to prevent
        traffic interception attacks. Defaults to True.

    :rtype: str
    :return: Web page retrieved for the given URL.

    :raises IOError: An exception is raised on error.
    :raises urllib2.URLError: An exception is raised on error.
    :raises urllib2.HTTPError: An exception is raised on error.
    """
    if user_agent is None:
        user_agent = USER_AGENT
    request = Request(url)
    request.add_header('User-Agent', user_agent)
    cookie_jar.add_cookie_header(request)
    if verify_ssl:
        response = urlopen(request)
    else:
        context = ssl._create_unverified_context()
        response = urlopen(request, context=context)
    cookie_jar.extract_cookies(response, request)
    html = response.read()
    response.close()
    try:
        cookie_jar.save()
    except Exception:
        pass
    return html


# Filter links found in the Google result pages HTML code.
# Returns None if the link doesn't yield a valid result.
def filter_result(link):
    try:

        # Decode hidden URLs.
        if link.startswith('/url?'):
            o = urlparse(link, 'http')
            link = parse_qs(o.query)['q'][0]

        # Valid results are absolute URLs not pointing to a Google domain,
        # like images.google.com or googleusercontent.com for example.
        # TODO this could be improved!
        o = urlparse(link, 'http')
        if o.netloc and 'google' not in o.netloc:
            return link

    # On error, return None.
    except Exception:
        pass


# Returns a generator that yields URLs.
def search(query, tld='com', lang='en', tbs='0', safe='off', num=10,
           start=0, stop=None, pause=2.0, country='', extra_params=None,
           user_agent=None, verify_ssl=True):
    """
    Search the given query string using Google.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int stop: Last result to retrieve.
        Use None to keep searching forever.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param str country: Country or region to focus the search on. Similar to
        changing the TLD, but does not yield exactly the same results.
        Only Google knows why...
    :param dict extra_params: A dictionary of extra HTTP GET
        parameters, which must be URL encoded. For example if you don't want
        Google to filter similar results you can set the extra_params to
        {'filter': '0'} which will append '&filter=0' to every query.
    :param str user_agent: User agent for the HTTP requests.
        Use None for the default.
    :param bool verify_ssl: Verify the SSL certificate to prevent
        traffic interception attacks. Defaults to True.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Count the number of links yielded.
    count = 0

    # Prepare the search string.
    query = quote_plus(query)

    # If no extra_params is given, create an empty dictionary.
    # We should avoid using an empty dictionary as a default value
    # in a function parameter in Python.
    if not extra_params:
        extra_params = {}

    # Check extra_params for overlapping.
    for builtin_param in url_parameters:
        if builtin_param in extra_params.keys():
            raise ValueError(
                'GET parameter "%s" is overlapping with '
                'the built-in GET parameter' % builtin_param
            )

    # Grab the cookie from the home page.
    get_page(url_home % vars(), user_agent, verify_ssl)

    # Prepare the URL of the first request.
    if start:
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
    else:
        if num == 10:
            url = url_search % vars()
        else:
            url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    while not stop or count < stop:

        # Remember last count to detect the end of results.
        last_count = count

        # Append extra GET parameters to the URL.
        # This is done on every iteration because we're
        # rebuilding the entire URL at the end of this loop.
        for k, v in extra_params.items():
            k = quote_plus(k)
            v = quote_plus(v)
            url = url + ('&%s=%s' % (k, v))

        # Sleep between requests.
        # Keeps Google from banning you for making too many requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url, user_agent, verify_ssl)

        # Parse the response and get every anchored URL.
        if is_bs4:
            soup = BeautifulSoup(html, 'html.parser')
        else:
            soup = BeautifulSoup(html)
        try:
            anchors = soup.find(id='search').findAll('a')
            # Sometimes (depending on the User-agent) there is
            # no id "search" in html response...
        except AttributeError:
            # Remove links of the top bar.
            gbar = soup.find(id='gbar')
            if gbar:
                gbar.clear()
            anchors = soup.findAll('a')

        # Process every anchored URL.
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

            # Increase the results counter.
            # If we reached the limit, stop.
            count += 1
            if stop and count >= stop:
                return

        # End if there are no more results.
        # XXX TODO review this logic, not sure if this is still true!
        if last_count == count:
            break

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
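# A minimal usage sketch of search(), illustrative only: it performs live
# HTTP requests to Google when run, may be rate limited or blocked, and the
# query shown is purely hypothetical:
#
#   >>> for url in search('site:python.org tutorial', num=10, stop=20, pause=2.0):
#   ...     print(url)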
# Shortcut to single-item search.
# Evaluates the iterator to return the single URL as a string.
def lucky(*args, **kwargs):
    """
    Shortcut to single-item search.
    Same arguments as the main search function, but the return value changes.

    :rtype: str
    :return: URL found by Google.
    """
    return next(search(*args, **kwargs))
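# A minimal, self-contained usage sketch. The guard below means it only runs
# when this module is executed directly, never on import. It performs live
# requests to Google, may be rate limited or blocked, and the query is
# purely illustrative.
if __name__ == '__main__':
    example_query = 'python generators'  # Hypothetical example query.
    print('Lucky result: %s' % lucky(example_query))
    print('Top results:')
    for result_url in search(example_query, stop=5, pause=2.0):
        print('    %s' % result_url)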