pax_global_header00006660000000000000000000000064135673167210014525gustar00rootroot0000000000000052 comment=3987b744dfcec7ab0e36a233e61f138fc4426051 googler-4.0/000077500000000000000000000000001356731672100130265ustar00rootroot00000000000000googler-4.0/.circleci/000077500000000000000000000000001356731672100146615ustar00rootroot00000000000000googler-4.0/.circleci/config.yml000066400000000000000000000033231356731672100166520ustar00rootroot00000000000000version: 2 test-template: &test-template working_directory: ~/googler environment: NUM_TEST_ITERATIONS: 30 SLEEP_DURATION: 3 steps: - run: apt update && apt install -y --no-install-recommends git wamerican - checkout - run: ./tests/test --ci jobs: py35: docker: - image: python:3.5-slim <<: *test-template py36: docker: - image: python:3.6-slim <<: *test-template py37: docker: - image: python:3.7-slim <<: *test-template py38: docker: - image: python:3.8-slim <<: *test-template package-and-publish: machine: true working_directory: ~/googler steps: - checkout - run: name: "package with packagecore" command: | # Use latest installed python3 from pyenv export PYENV_VERSION="$(pyenv versions | grep -Po '\b3\.\d+\.\d+' | tail -1)" pip install packagecore packagecore -c tests/packagecore/packagecore.yaml -o ./dist/ ${CIRCLE_TAG#v} - run: name: "publish to GitHub" command: | go get github.com/tcnksm/ghr ghr -t ${GITHUB_API_TOKEN} -u ${CIRCLE_PROJECT_USERNAME} -r ${CIRCLE_PROJECT_REPONAME} -c ${CIRCLE_SHA1} -replace ${CIRCLE_TAG} ./dist/ workflows: version: 2 test: jobs: &all-tests - py35 - py36 - py37 - py38 weekly: triggers: - schedule: cron: "0 0 * * 6" filters: branches: only: - master jobs: *all-tests publish-github-release: jobs: - package-and-publish: filters: tags: only: /^v.*/ branches: ignore: /.*/ googler-4.0/.github/000077500000000000000000000000001356731672100143665ustar00rootroot00000000000000googler-4.0/.github/FUNDING.yml000066400000000000000000000001331356731672100162000ustar00rootroot00000000000000custom: 
https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=RMLTQ76JSXJ4Q googler-4.0/.github/ISSUE_TEMPLATE.md000066400000000000000000000024111356731672100170710ustar00rootroot00000000000000#### Bug reports Before opening an issue, please 💥💥TRY TO REPRODUCE ON [THE LATEST DEVELOPMENT VERSION][master] FIRST💥💥 The bug you noticed might have already been fixed. [master]: https://github.com/jarun/googler#downloading-a-single-file If the issue can be reproduced on master, then please make sure you provide the following: - Output of `googler -d`; - Link to the response body (you should see a line like `[DEBUG] Response body written to '/Volumes/ramdisk/googler-response-xxxxxxxx'` in the output of `googler -d`; please upload the file to a [gist](https://gist.github.com/) and include the gist's URL in the issue); - Details of operating system, Python version used, terminal emulator and shell; - `locale` output, if relevant. It's a good idea to set your locale to UTF-8. Please refer to [googler #131](https://github.com/jarun/googler/issues/131). If we need more information and there is no communication from the bug reporter within 7 days from the date of request, we will close the issue. If you have relevant information, resume discussion any time. #### Feature requests Please consider contributing the feature back to `googler` yourself. Feel free to discuss. We are more than happy to help. --- PLEASE DELETE THIS LINE AND EVERYTHING ABOVE --- googler-4.0/.github/lock.yml000066400000000000000000000000721356731672100160400ustar00rootroot00000000000000daysUntilLock: 60 lockComment: false setLockReason: false googler-4.0/.gitignore000066400000000000000000000000231356731672100150110ustar00rootroot00000000000000build/ dist/ *.bak googler-4.0/CHANGELOG000066400000000000000000000303651356731672100142470ustar00rootroot00000000000000googler 4.0 2019-11-27 What's in? 
- Switch to modern UA and fix parser to handle new result format - Text-wrapping ------------------------------------------------------------------------------- googler 3.9 2019-05-30 What's in? - fix issue - `googler` showing "No results." - show matched keywords in bold in result abstracts - option `--colorize` for more control on colors - better support for colors on Windows - switch to CircleCI from Travis - option `--noua` is deprecated (noop) and will be removed in future ------------------------------------------------------------------------------- googler 3.8 2019-03-27 What's in? - A complete parser rewrite - Visual redesign of the output format - Text-wrapping for CJK wide characters - Refresh current page on URL expansion toggle - Available on Raspbian testing and Chocolatey - Several important fixes ------------------------------------------------------------------------------- googler 3.7.1 2018-10-10 What's in? - Custom user agent - Fix to unescape auto-completions ------------------------------------------------------------------------------- googler 3.7 2018-09-16 What's in? - Support xclip as a clipboard utility on *nix - Support GNU Screen and tmux as clipboard fallback - Support Termux clipboard on Android ------------------------------------------------------------------------------- googler 3.6 2018-05-23 What's in? - Decode auto-completion info as per charset in response header - Ignore trailing `/` in proxy - Some health sites added to googler @t - User agent updated to Firefox 60 - Availability on Fedora and openSUSE - More auto-generated packages ------------------------------------------------------------------------------- googler 3.5 2018-02-16 What's in? - URL folding to show only domain name - Omniprompt key `c` to copy URL to clipboard - Support env var `DISABLE_PROMPT_COLOR` to disable prompt color (see #203) Note: Python 3.3 reached EOL, will not be supported anymore. 
------------------------------------------------------------------------------- googler 3.4 2017-10-02 What's in? - Support custom URL handler script or cli utility (option `--url-handler`) - Support text browser override with GUI browser (omniprompt key `O`) - A stunning project logo! (designed by @zmwangx) ------------------------------------------------------------------------------- googler 3.3 2017-08-17 What's in? - Search auto-completion (using completion scripts) - Python 3.6 support - Automated release package builds using PackageCore ------------------------------------------------------------------------------- googler 3.2 2017-07-07 What's in? - Basic authentication with `--proxy` - Option `--unfilter` to include similar results - New googler @ts : Manga Reader, Mac Rumors, OMG! Ubuntu! - Fix: skip certain card results with `--noua` - options `--json` and `--exact` decoupled ------------------------------------------------------------------------------- googler 3.1 2017-04-28 What's in? - Search result metadata (e.g. IMDB rating) - Multi-site search - Browse numeric ranges at omniprompt - googler@ - Financial Times, The Pirate Bay added ------------------------------------------------------------------------------- googler 3.0 2017-03-12 Modifications - Introducing [googler @t](https://github.com/jarun/googler#googler-t) add-on! - Open multiple indices from omniprompt - Open all indices from omniprompt - Option `--enable-browser-output` is now `--show-browser-logs` - Multiple bug fixes ------------------------------------------------------------------------------- googler 2.9 2016-12-18 **NOTICE** - `googler` is on Debian and Ubuntu official releases now. In addition, there's a PPA in place to install the latest program releases from. 
Modifications - Omniprompt option to search exact keywords on auto-correction - Push cmdline arguments to readline history (simplifies editing the keywords) - Added check to ensure UTF-8 encoding - Support 3 HTTP redirections before failing to connect - Support environment variable https_proxy - Python 3.5.3 compliance for TLS 1.2 - Removed deb package generation scripts ------------------------------------------------------------------------------- googler 2.8 2016-10-04 Modifications - Add option --notweak to disable TCP optimizations and forced TLS 1.2. - Limited self-upgrade options to -U or --upgrade. Removed --update. ------------------------------------------------------------------------------- googler v2.7 2016-08-28 Modifications - Show google services abstract with User Agent disabled. - In-place self-upgrade mechanism. - Fix integration with text-based browsers. - Set process title to googler if setproctitle is installed. ------------------------------------------------------------------------------- googler v2.6 2016-07-06 Modifications - Option `--noua` to disable UA (default - enabled). - Logging and auto-completion script changes. ------------------------------------------------------------------------------- googler v2.5.1 2016-06-13 Modifications - Enable TCP/IP optimizations only for Linux. This fails on OS X. NOTE: The optimizations do not work on Linux 2.4 and earlier either. ------------------------------------------------------------------------------- googler v2.5 2016-06-12 **NOTICE:** - Python 2.x support is discontinued. 
- googler is now available on [Debian Sid](https://packages.debian.org/unstable/main/googler) Modifications - Invoking `googler` without search keywords shows omniprompt - Introduced options -h and --help to show program help and exit - Support cookie - Use TLS 1.2 (Python 3.4 and above) - Omniprompt key to unfilter filtered similar results - HTTPS proxy support (non-TLS 1.2 supported) - News time shown in cyan by default - Tons of code, logging and debug improvements (thanks Zhiming) ------------------------------------------------------------------------------- googler v2.4.1 2016-05-22 **NOTICE:** Python 2.x support is deprecated now. While it's still possible to use Python 2.x by editing the shebang, we have found issues with Python 2.x (e.g. readline doesn't work) which don't have a satisfactory solution without impacting other features. Python 2.x support will be completely removed in the next version. Modifications - Sitelinks support - Customizable colours - Context in News results - .deb package for Debian and Ubuntu family - Basic support for terminal emulators having ANSI escape sequence support on Windows - New omniprompt option -f to jump to first results page - New omniprompt key -o to open the current search in browser - Shorter omniprompt - Non-interactive mode to fetch results and exit - JSON output support - A complete re-write of the HTML parser ------------------------------------------------------------------------------- googler v2.3 2016-04-23 Modifications - Google Site Search support (option -w) - Auto-completion scripts for Zsh, Bash and Fish shells - All Google top level domains supported - Show time for news - Integrated omniprompt help - Move to argparse - Additional long options easier to remember - Graceful SIGINT handler - Add version to debug logs AND ... 
- An *awesssome* asciinema recording for the README from Zhiming ------------------------------------------------------------------------------- googler v2.2 2016-03-12 Modifications - Show quotes in text and title - Option to disable automatic spelling correction - User agent identifier added for all requests - Improved concise omniprompt with color inversion to work as a page separator - Set column size to auto when sys.stderr is not a tty - Decode HTTPS response in UTF-8 - Dynamically detect python version using /usr/bin/env - Handle EOF (Ctrl-d) at omniprompt Improvements - Refactored code - Modularized code for repetitive logic - Unnecessary code removal - Dump full HTML response in debug mode - Homebrew integration - Travis integration - A better readme in 100% markdown and ToC with references ------------------------------------------------------------------------------- googler v2.1 2016-02-01 Modifications - Project renamed to googler, same as the utility - Gzip compression to fetch data - Improved continuous search (works without the `g` key at prompt now. Check Example 10 in README for exceptions) - Skip Google News, Images links and ads - Show skipped link count ------------------------------------------------------------------------------- google-cli v2.0 2016-01-09 Modifications - IMPORTANT fix for issue #19: Google replaced "li" with "div" as search result separator. Users must update to this release or latest dev version for google-cli to work. - Handle formatting on Mac OS X in emacs eshell (or any terminal environment where number of columns returned is 0). - PEP 8 style adaptation. Thanks @shaggytwodope! ------------------------------------------------------------------------------- google-cli v1.9 2015-11-13 Modifications - Skip results without any URL (Google custom results like time, define etc.). - Use readline library to support arrow keys in input. - Support installation on OSX. Thanks @ibaaj. 
- Pre-check negative index before attempting to open URL. - Handle exception: "socket.gaierror: [Errno -2] Name or service not known" due to connection throttle on low-bandwidth. - Print correct Exception in case of connection timeout. ------------------------------------------------------------------------------- google-cli v1.8 2015-10-11 Modification - Added timeout to HTTPSConnection() - Redirected stdout and stderr to suppress all warning & error messages when opening results in Firefox ------------------------------------------------------------------------------- google-cli v1.7 2015-10-07 Modification - Added support for redirection and piping - Used stderr instead of stdin to determine console geometry ------------------------------------------------------------------------------- google-cli v1.6 2015-09-12 Modification - Changed incremental search key from s to g keeping in mind that users may use g as the alias for googler. Fix - Handle httplib.BadStatusLine exception. This happens if the connection is closed due to inactivity. Now googler will reconnect and re-issue the search. 
------------------------------------------------------------------------------- google-cli v1.5 2015-09-04 New capabilities - Incremental search support from the same running instance - Utility name changed to googler to avoid any copyright infringements ------------------------------------------------------------------------------- google-cli v1.2 2015-09-03 New capabilities - Open result in browser using index number (thanks jeremija) - Google News support - Time limit search by hours - Country specific search (28 top-level domains added) - Add switch to enable debug logs Removal - Removed file type specific search option -f in favour of filetype:mime Google keyword Fixes - Convert %22 to " (double quote) in URLs - Inputs other than n, p or number (+ Enter) exit - Fix failure to open URL with " (double quotes) in browser - Fix version information in manpage - Get rid of Google Chrome debug/error messages in console when opening URL ------------------------------------------------------------------------------- google-cli v1.1 2015-08-25 New capabilities - Add Python 3.x support - Add UTF-8 request and response [both the contributions are from Narrat] NOTE: The next change in queue is to support opening the URLs in browser. As we can see during preliminary tests, there are several issues around Google Chrome and its mods. This release works as a stable release before we hop on. 
------------------------------------------------------------------------------- google-cli v1.0 2015-08-22 New capabilities - HTTPS support - Navigate as in regular google search - File type in search as an option - Time limited search (day, week, month, year) - Show full text snippet of search results - Unicode in URL support - Honour -j even if -n is not used and open the result in browser - Skip browser to show result in console for empty URL, e.g., first result of 'define hello' - Handle google redirections (error 302) - Throw error in case of google error due to unusual activity from IP Fixes - Adapt to new google HTML response - Fixed character encoding problem in URL e.g. double quotes (%22) changed to %2522 ------------------------------------------------------------------------------- googler-4.0/LICENSE000066400000000000000000001045131356731672100140370ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. 
If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. 
Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . googler-4.0/Makefile000066400000000000000000000015301356731672100144650ustar00rootroot00000000000000PREFIX ?= /usr/local BINDIR = $(DESTDIR)$(PREFIX)/bin MANDIR = $(DESTDIR)$(PREFIX)/share/man/man1 DOCDIR = $(DESTDIR)$(PREFIX)/share/doc/googler .PHONY: all install uninstall disable-self-upgrade all: install: install -m755 -d $(BINDIR) install -m755 -d $(MANDIR) install -m755 -d $(DOCDIR) gzip -c googler.1 > googler.1.gz install -m755 googler $(BINDIR) install -m644 googler.1.gz $(MANDIR) install -m644 README.md $(DOCDIR) rm -f googler.1.gz uninstall: rm -f $(BINDIR)/googler rm -f $(MANDIR)/googler.1.gz rm -rf $(DOCDIR) # Disable the self-upgrade mechanism entirely. Intended for packagers. # # We assume that sed(1) has the -i option, which is not POSIX but seems common # enough in modern implementations. disable-self-upgrade: sed -i.bak 's/^ENABLE_SELF_UPGRADE_MECHANISM = True$$/ENABLE_SELF_UPGRADE_MECHANISM = False/' googler googler-4.0/README.md000066400000000000000000000547301356731672100143160ustar00rootroot00000000000000

googler

Latest release AUR Homebrew Debian Stretch+ Fedora 27+ openSUSE Leap 15.0+ Ubuntu Yakkety+

Availability License Build Status

Asciicast

`googler` is a power tool to Google (Web & News) and Google Site Search from the command-line. It shows the title, URL and abstract for each result, which can be directly opened in a browser from the terminal. Results are fetched in pages (with page navigation). Supports sequential searches in a single `googler` instance. `googler` was initially written to cater to headless servers without X. You can integrate it with a text-based browser. However, it has grown into a very handy and flexible utility that delivers much more. For example, fetch any number of results or start anywhere, limit search by any duration, define aliases to google search any number of websites, switch domains easily... all of this in a very clean interface without ads or stray URLs. The shell completion scripts make sure you don't need to remember any options. `googler` isn't affiliated with Google in any way. More fun stuff you can try with `googler`: - [googler on the iPad](https://github.com/jarun/googler/wiki/googler-on-the-iPad) - [Print content of results to terminal or listen to it](https://github.com/jarun/googler/wiki/Print-content-of-results-to-terminal-or-listen-to-it) - [Terminal Reading Mode or Reader View](https://github.com/jarun/googler/wiki/Terminal-Reading-Mode-or-Reader-View) - [Stream YouTube videos on desktop](https://github.com/jarun/googler/wiki/Stream-YouTube-videos-on-desktop) - [Search error on StackOverflow from terminal](https://github.com/jarun/googler/wiki/Search-error-on-StackOverflow-from-terminal) *Love smart and efficient utilities? Explore [my repositories](https://github.com/jarun?tab=repositories). Buy me a cup of coffee if they help you.*

Donate via PayPal!

### Table of contents - [Features](#features) - [Installation](#installation) - [Dependencies](#dependencies) - [From a package manager](#from-a-package-manager) - [Tips for packagers](#tips-for-packagers) - [Release packages](#release-packages) - [From source](#from-source) - [Running standalone](#running-standalone) - [Downloading a single file](#downloading-a-single-file) - [Shell completion](#shell-completion) - [Usage](#usage) - [Cmdline options](#cmdline-options) - [Configuration file](#configuration-file) - [googler @t](#googler-t) - [Text-based browser integration](#text-based-browser-integration) - [Colors](#colors) - [Domain-only URL](#domain-only-url) - [Examples](#examples) - [Troubleshooting](#troubleshooting) - [Notes](#notes) - [Contributions](#contributions) - [Developers](#developers) ### Features - Google Search, Google Site Search, Google News - Fast and clean (no ads, stray URLs or clutter), custom color - Navigate result pages from omniprompt, open URLs in browser - Effortless keyword-based site search with googler @t add-on - Search and option completion scripts for Bash, Zsh and Fish - Fetch n results in a go, start at the nth result - Disable automatic spelling correction and search exact keywords - Specify duration, country/domain (default: worldwide/.com), language - Google keywords (e.g. `filetype:mime`, `site:somesite.com`) support - Open the first result directly in browser (as in *I'm Feeling Lucky*) - Non-stop searches: fire new searches at omniprompt without exiting - HTTPS proxy, User Agent, TLS 1.2 (default) support - Comprehensive documentation, man page with handy usage examples - Minimal dependencies ### Installation #### Dependencies `googler` requires Python 3.5 or later. Only the latest patch release of each minor version is supported. 
To copy url to clipboard at the omniprompt, `googler` looks for `xsel` or `xclip` or `termux-clipboard-set` (in the same order) on Linux, `pbcopy` (default installed) on macOS and `clip` (default installed) on Windows. It also supports GNU Screen and tmux copy-paste buffers in the absence of X11. #### From a package manager - [AUR](https://aur.archlinux.org/packages/googler/) (`yay -S googler`) - [Chocolatey](https://chocolatey.org/packages/googler) (`choco install googler`) - [Debian](https://packages.debian.org/search?keywords=googler&searchon=names) (`apt-get install googler`) - [Fedora](https://apps.fedoraproject.org/packages/googler) (`dnf install googler`) - [FreeBSD](https://www.freshports.org/www/googler/) (`pkg install googler`) - [macOS/Homebrew](http://formulae.brew.sh/formula/googler) (`brew install googler`) - [NixOS](https://github.com/NixOS/nixpkgs/tree/master/pkgs/applications/misc/googler) (`nix-env -i googler`) - [openSUSE](https://software.opensuse.org/search?q=googler) (`zypper in googler`) - [Raspbian Testing](https://archive.raspbian.org/raspbian/pool/main/g/googler/) (`apt-get install googler`) - [Slackware](http://slackbuilds.org/repository/14.2/network/googler/) (`slackpkg install googler`) - [Snap Store](https://snapcraft.io/googler) (`snap install googler`) - [Ubuntu](https://packages.ubuntu.com/search?keywords=googler&searchon=names) (`apt-get install googler`) - [Void Linux](https://github.com/void-linux/void-packages/blob/master/srcpkgs/googler/template) (`xbps-install -S googler`) ##### Tips for packagers `googler` v2.7 and later ships with an in-place self-upgrade mechanism which you may want to disable. To do this, run $ make disable-self-upgrade before installation. #### Release packages Packages for Arch Linux, CentOS, Debian, Fedora, openSUSE and Ubuntu are available with the [latest stable release](https://github.com/jarun/googler/releases/latest). #### From source If you have git installed, clone this repository. 
Otherwise download the [latest stable release](https://github.com/jarun/googler/releases/latest) or [development version](https://github.com/jarun/googler/archive/master.zip). To install to the default location (`/usr/local`): $ sudo make install To remove `googler` and associated docs, run $ sudo make uninstall `PREFIX` is supported, in case you want to install to a different location. #### Running standalone `googler` is a standalone executable (and can run even on environments like Termux). From the containing directory: $ ./googler #### Downloading a single file `googler` is a single standalone script, so you could download just a single file if you'd like to. To install the latest stable version, run $ sudo curl -o /usr/local/bin/googler https://raw.githubusercontent.com/jarun/googler/v4.0/googler && sudo chmod +x /usr/local/bin/googler You could then let googler upgrade itself by running $ sudo googler -u Similarly, if you want to install from git master (*risky*), run $ sudo curl -o /usr/local/bin/googler https://raw.githubusercontent.com/jarun/googler/master/googler && sudo chmod +x /usr/local/bin/googler and upgrade by running $ sudo googler -u --include-git ### Shell completion Search keyword and option completion scripts for Bash, Fish and Zsh can be found in respective subdirectories of [`auto-completion/`](auto-completion). Please refer to your shell's manual for installation instructions. ### Usage #### Cmdline options ``` usage: googler [-h] [-s N] [-n N] [-N] [-c TLD] [-l LANG] [-x] [--colorize [{auto,always,never}]] [-C] [--colors COLORS] [-j] [-t dN] [-w SITE] [--unfilter] [-p PROXY] [--noua] [--notweak] [--json] [--url-handler UTIL] [--show-browser-logs] [--np] [-u] [--include-git] [-v] [-d] [KEYWORD [KEYWORD ...]] Google from the command-line. 
positional arguments: KEYWORD search keywords optional arguments: -h, --help show this help message and exit -s N, --start N start at the Nth result -n N, --count N show N results (default 10) -N, --news show results from news section -c TLD, --tld TLD country-specific search with top-level domain .TLD, e.g., 'in' for India -l LANG, --lang LANG display in language LANG -x, --exact disable automatic spelling correction --colorize [{auto,always,never}] whether to colorize output; defaults to 'auto', which enables color when stdout is a tty device; using --colorize without an argument is equivalent to --colorize=always -C, --nocolor equivalent to --colorize=never --colors COLORS set output colors (see man page for details) -j, --first, --lucky open the first result in web browser and exit -t dN, --time dN time limit search [h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)] -w SITE, --site SITE search a site using Google --unfilter do not omit similar results -p PROXY, --proxy PROXY tunnel traffic through an HTTP proxy; PROXY is of the form [http://][user:password@]proxyhost[:port] --noua legacy option (no effect) --notweak disable TCP optimizations and forced TLS 1.2 --json output in JSON format; implies --noprompt --url-handler UTIL custom script or cli utility to open results --show-browser-logs do not suppress browser output (stdout and stderr) --np, --noprompt search and exit, do not prompt -u, --upgrade perform in-place self-upgrade --include-git when used with --upgrade, get latest git master -v, --version show program's version number and exit -d, --debug enable debugging omniprompt keys: n, p fetch the next or previous set of search results index open the result corresponding to index in browser f jump to the first page o [index|range|a ...] open space-separated result indices, numeric ranges (sitelinks unsupported in ranges), or all, in browser open the current search in browser, if no arguments O [index|range|a ...] 
like key 'o', but try to open in a GUI browser g keywords new Google search for 'keywords' with original options should be used to search omniprompt keys and indices c index copy url to clipboard u toggle url expansion q, ^D, double Enter exit googler ? show omniprompt help * other inputs issue a new search with original options ``` #### Configuration file `googler` doesn't have any! This is to retain the speed of the utility and avoid OS-specific differences. Users can enjoy the advantages of config files using aliases (with the exception of the color scheme, which can be additionally customized through an environment variable; see [Colors](#colors)). There's no need to memorize options. For example, the following alias for bash/zsh/ksh/etc. alias g='googler -n 7 -c ru -l ru' fetches 7 results from the Google Russia server, with preference towards results in Russian. The alias serves both the purposes of using config files: - Persistent settings: when the user invokes `g`, it expands to the preferred settings. - Override settings: thanks to the way Python `argparse` works, `googler` is written so that the settings in alias are completely overridden by any options passed from cli. So when the same user runs `g -l de -c de -n 12 hello world`, 12 results are returned from the Google Germany server, with preference towards results in German. #### googler @t `googler @t` is a convenient add-on to Google Site Search with unique keywords. While `googler` has an integrated option to search a site, we simplified it further with aliases. The file [googler_at](https://github.com/jarun/googler/blob/master/auto-completion/googler_at/googler_at) contains a list of website search aliases. To source it, run: $ source googler_at or, $ . googler_at With `googler @t`, here's how you search Wikipedia for `hexspeak`: $ @w hexspeak Oh yes! You can combine other `googler` options too! To make life easier, you can also configure your shell to source the file when it starts. 
All the aliases start with the `@` symbol (hence the name `googler @t`) and there is minimal chance they will conflict with any shell commands. Feel free to add your own aliases to the file and contribute back the interesting ones. #### Text-based browser integration `googler` works out of the box with several text-based browsers if the `BROWSER` environment variable is set. For instance, $ export BROWSER=w3m or for one-time use, $ BROWSER=w3m googler query Due to certain graphical browsers spewing messages to the console, `googler` suppresses browser output by default unless `BROWSER` is set to one of the known text-based browsers: currently `elinks`, `links`, `lynx`, `w3m` or `www-browser`. If you use a different text-based browser, you will need to explicitly enable browser output with the `--show-browser-logs` option. If you believe your browser is popular enough, please submit an issue or pull request and we will consider whitelisting it. See the man page for more details on `--show-browser-logs`. If you need to use a GUI browser with `BROWSER` set, use the omniprompt key `O`. `googler` will try to ignore text-based browsers and invoke a GUI browser. Browser logs are always suppressed with `O`. #### Colors `googler` allows you to customize the color scheme via a six-letter string, reminiscent of BSD `LSCOLORS`. The six letters represent the colors of - indices - titles - URLs - metadata/publishing info (Google News only) - abstracts - prompts respectively. The six-letter string is passed in either as the argument to the `--colors` option, or as the value of the environment variable `GOOGLER_COLORS`.
We offer the following colors/styles: Letter | Color/Style ------ | ----------- a | black b | red c | green d | yellow e | blue f | magenta g | cyan h | white i | bright black j | bright red k | bright green l | bright yellow m | bright blue n | bright magenta o | bright cyan p | bright white A-H | bold version of the lowercase-letter color I-P | bold version of the lowercase-letter bright color x | normal X | bold y | reverse video Y | bold reverse video The default colors string is `GKlgxy`, which stands for - bold cyan indices - bold bright green titles - bright yellow URLs - cyan metadata/publishing info - normal abstracts - reverse video prompts Note that - Bright colors (implemented as `\x1b[90m`–`\x1b[97m`) may not be available in all color-capable terminal emulators; - Some terminal emulators draw bold text in bright colors instead; - Some terminal emulators only distinguish between bold and bright colors via a default-off switch. Please consult the manual of your terminal emulator as well as the [Wikipedia article](https://en.wikipedia.org/wiki/ANSI_escape_code) on ANSI escape sequences. #### Domain-only URL To show the domain names in search results instead of the expanded URL (and use less space), set the environment variable `DISABLE_URL_EXPANSION`. ### Examples 1. Google **hello world**: $ googler hello world 2. Fetch **15 results** updated within the last **14 months**, starting from the **3rd result** for the keywords **jungle book** in **site** imdb.com: $ googler -n 15 -s 3 -t m14 -w imdb.com jungle book 3. Read recent **news** on gadgets: $ googler -N gadgets 4. Fetch results on IPL cricket from **Google India** server in **English**: $ googler -c in -l en IPL cricket 5. Search **quoted text**: $ googler it\'s a \"beautiful world\" in spring 6. Search for a **specific file type**: $ googler instrumental filetype:mp3 7. Disable **automatic spelling correction**, e.g. fetch results for `googler` instead of `google`: $ googler -x googler 8.
**I'm feeling lucky** search: $ googler -j leather jackets 9. **Website specific** search: $ googler -w amazon.com -w ebay.com digital camera Site-specific search continues at the omniprompt. 10. Alias to find **definitions of words**: alias define='googler -n 2 define' 11. Look up `n`, `p`, `o`, `O`, `q`, `g keywords` or a result index at the **omniprompt**: as the omniprompt recognizes these keys or index strings as commands, you need to prefix them with `g`, e.g., g n g g keywords g 1 12. Input and output **redirection**: $ googler -C hello world < input > output Note that `-C` is required to avoid printing control characters (for colored output). 13. **Pipe** output: $ googler -C hello world | tee output 14. Use a **custom color scheme**, e.g., a warm color scheme designed for Solarized Dark ([screenshot](https://i.imgur.com/6L8VlfS.png)): $ googler --colors bjdxxy google $ GOOGLER_COLORS=bjdxxy googler google 15. Tunnel traffic through an **HTTPS proxy**, e.g., a local Privoxy instance listening on port 8118: $ googler --proxy localhost:8118 google By default the environment variable `https_proxy` is used, if defined. 16. Quote multiple search keywords to auto-complete (using completion script): $ googler 'hello w 17. More **help**: $ googler -h $ man googler ### Troubleshooting 1. In some instances `googler` may show fewer results than you expect, e.g., if you fetch a single result (`-n 1`) it may not show any results. The reason is that Google shows some Google service (e.g. YouTube) results, map locations etc. depending on your geographical data, which `googler` tries to omit. In some cases Google (the web-service) doesn't show exactly 10 results (default) on a search. We chose to omit these results as far as possible. While this can be fixed, it would need more processing (and more time). You can just navigate forward to fetch the next set of results. 2. By default `googler` applies some TCP optimizations and forces TLS 1.2 (on Python 3.4 and above).
If you are facing connection issues, try disabling both using the `--notweak` switch. 3. Google News service is not available if the language is `dk` (Denmark), `fi` (Finland) or `is` (Iceland). Use `-l en`. Please refer to #187 for more information. 4. Some users have reported problems with a colored omniprompt (refer to issue [#203](https://github.com/jarun/googler/issues/203)) with iTerm2 on macOS. To force a plain omniprompt: export DISABLE_PROMPT_COLOR=1 ### Notes 1. Initially I raised a pull request but I could see that the last change was made 7 years earlier. In addition, there has been no GitHub activity from the original author [Henri Hakkinen](https://github.com/henux) in the past year. I have created this independent repo for the project with the name `googler`. I retained the original copyright information (though `googler` is organically different now). 2. Google provides a search API which returns the results in JSON format. However, as per my understanding from the [official docs](https://developers.google.com/custom-search/json-api/v1/overview), the API issues the queries against an existing instance of a custom search engine and is limited to 100 free search queries per day. In addition, I have reservations about paying if they ever change their plan or restrict the API in other ways. So I refrained from coupling with Google plans & policies or exposing my trackable personal custom search API key and identifier to the public. I retained the browser way of doing it by fetching HTML, which is an open and free specification. 3. You can find a rofi script for `googler` [here](http://hastebin.com/fonowacija.bash). It was written by an anonymous user, is untested, and we don't maintain it. ### Contributions Pull requests are welcome. Please visit [#209](https://github.com/jarun/googler/issues/209) for a list of TODOs.

gitter chat

### Developers 1. Copyright © 2008 Henri Hakkinen 2. Copyright © 2015-2019 [Arun Prakash Jana](https://github.com/jarun) 3. [Zhiming Wang](https://github.com/zmwangx) 4. [Johnathan Jenkins](https://github.com/shaggytwodope) 5. [SZ Lin](https://github.com/szlin) Special thanks to [jeremija](https://github.com/jeremija) and [Narrat](https://github.com/Narrat) for their contributions. ### Logo Logo copyright © 2017 Zhiming Wang. You may freely redistribute it alongside the code, or use it when describing or linking to this project. You should NOT create modified versions of it, make it the logo or icon of your project (except personal forks and/or forks with the goal of upstreaming), or otherwise use it without written permission. googler-4.0/auto-completion/000077500000000000000000000000001356731672100161455ustar00rootroot00000000000000googler-4.0/auto-completion/bash/000077500000000000000000000000001356731672100170625ustar00rootroot00000000000000googler-4.0/auto-completion/bash/googler-completion.bash000066400000000000000000000030641356731672100235310ustar00rootroot00000000000000# # Rudimentary Bash completion definition for googler. # # Author: # Zhiming Wang #

# Completion handler registered with `complete -F` for googler.
#
# Arguments (as supplied by bash to a `complete -F` function):
#   $2 - the word being completed
#   $3 - the word preceding it
# Outputs:
#   Fills COMPREPLY with option names when the current word starts with
#   a dash, otherwise with query suggestions from `googler --complete`.
# Returns:
#   1 (with COMPREPLY empty) when the previous word is an option that
#   takes an argument, so nothing is suggested for that argument;
#   0 otherwise.
_googler () {
    COMPREPLY=()
    local IFS=$' \n'
    local cur=$2 prev=$3
    # Everything below is local: this function runs inside the user's
    # interactive shell and must not leave stray globals behind.
    local opt completion
    local -a opts opts_with_arg
    opts=(
        -h --help
        -s --start
        -n --count
        -N --news
        -c --tld
        -l --lang
        -x --exact
        --colorize
        -C --nocolor
        --colors
        -j --first --lucky
        -t --time
        -w --site
        --unfilter
        -p --proxy
        --notweak
        --json
        --url-handler
        --show-browser-logs
        --np --noprompt
        -u --upgrade
        --include-git
        -v --version
        -d --debug
    )
    opts_with_arg=(
        -s --start
        -n --count
        -c --tld
        -l --lang
        --colorize
        --colors
        -t --time
        -w --site
        -p --proxy
        --url-handler
    )

    if [[ $cur == -* ]]; then
        # The current argument is an option -- complete option names.
        # Word splitting on the local IFS is intended here; mapfile is
        # avoided so the script keeps working on bash 3.x.
        # shellcheck disable=SC2207
        COMPREPLY=( $(compgen -W "${opts[*]}" -- "$cur") )
    else
        # Do not complete option arguments; only autocomplete positional
        # arguments (queries).
        for opt in "${opts_with_arg[@]}"; do
            # Quote the right-hand side so $prev is compared literally
            # instead of being interpreted as a glob pattern.
            [[ $opt == "$prev" ]] && return 1
        done
        while IFS= read -r completion; do
            # Quote spaces for `complete -W wordlist`
            COMPREPLY+=( "${completion// /\\ }" )
        # Diagnostics from googler must not be printed over the prompt.
        done < <(googler --complete "$cur" 2>/dev/null)
    fi

    return 0
}

complete -F _googler googler
googler-4.0/auto-completion/fish/000077500000000000000000000000001356731672100170765ustar00rootroot00000000000000googler-4.0/auto-completion/fish/googler.fish000066400000000000000000000050741356731672100214150ustar00rootroot00000000000000# # Fish completion definition for googler. # # Author: # Arun Prakash Jana # function __fish_googler_non_option_argument not string match -- "-*" (commandline -ct) end function __fish_googler_complete_query googler --complete (commandline -ct) 2>/dev/null end complete -c googler -s h -l help --description 'show help text and exit' complete -c googler -s s -l start -r --description 'start at the Nth result' complete -c googler -s n -l count -r --description 'show specified number of results (default 10)' complete -c googler -s N -l news --description 'show results from news section' complete -c googler -s c -l tld -r --description 'country-specific search with top-level domain' complete -c googler -s l -l lang -r --description 'display in specified language' complete -c googler -s x -l exact --description 'disable automatic spelling correction' complete -c googler -l colorize -r --description 'whether to colorize output (options: auto/always/never)' complete -c googler -s C -l nocolor --description 'disable color output' complete -c googler -l colors -r --description 'set output colors' complete -c googler -s j -l first -l lucky --description 'open the first result in a web browser' complete -c googler -s t -l time -r --description 'time limit search (h/d/w/m/y + number)' complete -c googler -s w -l site -r --description 'search a site using Google' complete -c googler -l unfilter --description 'do not omit similar results' complete -c
googler -s p -l proxy -r --description 'proxy in HOST:PORT format' complete -c googler -l notweak --description 'disable TCP optimizations, forced TLS 1.2' complete -c googler -l json --description 'output in JSON format' complete -c googler -l url-handler -r --description 'cli script or utility' complete -c googler -l show-browser-logs --description 'do not suppress browser output' complete -c googler -l np -l noprompt --description 'perform search and exit' complete -c googler -s u -l upgrade --description 'perform in-place self-upgrade' complete -c googler -l include-git --description 'use git master for --upgrade' complete -c googler -s v -l version --description 'show version number and exit' complete -c googler -s d -l debug --description 'enable debugging' complete -c googler -n __fish_googler_non_option_argument -a '(__fish_googler_complete_query)' googler-4.0/auto-completion/googler_at/000077500000000000000000000000001356731672100202675ustar00rootroot00000000000000googler-4.0/auto-completion/googler_at/googler_at000066400000000000000000000167271356731672100223510ustar00rootroot00000000000000# googler @t alias list # Author: Arun Prakash Jana # email: engineerarun@gmail.com # # To request key addition or removal upstream, please drop an email. 
# A # Amazon.com alias @a='googler -w amazon.com' # AlternativeTo alias @alt='googler -w alternativeto.net' # Android Developers alias @android='googler -w developer.android.com' # ARM Information Center alias @arm='googler -w infocenter.arm.com' # asciinema alias @asciinema='googler -w asciinema.org' # Ask Ubuntu alias @askubuntu='googler -w askubuntu.com' # Arch Forums alias @abbs='googler -w bbs.archlinux.org' # Arch User Repository alias @aur='googler -w aur.archlinux.org' # Arch Wiki alias @aw='googler -w wiki.archlinux.org' # AZLyrics alias @azl='googler -w azlyrics.com' # B # BBC alias @bbc='googler -w bbc.co.uk' # Encyclopaedia Britannica alias @britannica='googler -w britannica.com' # C # crunchbase alias @cb='googler -w crunchbase.com' # Chrome Extensions alias @chrome='googler -w chrome.google.com' # craigslist alias @cl='googler -w craigslist.org' # commandlinefu alias @cmd='googler -w commandlinefu.com' # CNN alias @cnn='googler -w cnn.com' # Comedy Central alias @comedy='googler -w cc.com' # CPP Reference alias @cpp='googler -w en.cppreference.com' # Cracked.com alias @cracked='googler -w cracked.com' # Cricinfo alias @cricinfo='googler -w espncricinfo.com' # D # The Free Dictionary alias @d='googler -w thefreedictionary.com' # Dictionary.com alias @di='googler -w dictionary.com' # DistroWatch alias @distrowatch='googler -w distrowatch.com' # Daily Natural Remedies alias @dnr='googler -w dailynaturalremedies.com' # Debian Package Search alias @dpkg='googler -w packages.debian.org' # E # eBay alias @e='googler -w ebay.com' # Episode Guides alias @eg='googler -w epguides.com' # Embedded alias @embedded='googler -w embedded.com' # ESPN alias @espn='googler -w espn.com' # Etsy alias @etsy='googler -w etsy.com' # Online Etymology Dictionary alias @etym='googler -w etymonline.com' # F # Facebook alias @fb='googler -w facebook.com' # Fandango Movie Reviews alias @fd='googler -w fandango.com' # Firefox Add-ons alias @firefox='googler -w addons.mozilla.org' # 
Flipkart alias @fk='googler -w flipkart.com' # Forbes alias @forbes='googler -w forbes.com' # Forvo alias @forvo='googler -w forvo.com' # Financial Times alias @ft='googler -w markets.ft.com' # G # Google Search alias @g='googler' # Genius Lyrics alias @genius='googler -w genius.com' # GitHub alias @gh='googler -w github.com' # GNU alias @gnu='googler -w gnu.org' # Goal alias @goal='googler -w goal.com' # Goear Music alias @goear='googler -w goear.com' # The GNU Privacy Guard alias @gpg='googler -w gnupg.org' # Project Gutenberg alias @gutenberg='googler -w gutenberg.org' # H # Hackaday alias @had='googler -w hackaday.com' # Healthline alias @hl='googler -w healthline.com' # History alias @history='googler -w history.com' # Hacker News alias @hn='googler -w news.ycombinator.com' # HowStuffWorks alias @hsw='googler -w howstuffworks.com' # HowtoForge alias @htf='googler -w howtoforge.com' # Hulu alias @hulu='googler -w hulu.com' # I # IEEE alias @ieee='googler -w ieee.org' # IETF alias @ietf='googler -w ietf.org' # IETF Datatracker alias @ietfd='googler -w datatracker.ietf.org' # Instagram alias @ig='googler -w instagram.com' # IMDB alias @imdb='googler -w imdb.com' # Internet Radio alias @iradio='googler -w internet-radio.com' # J # K # The Linux Kernel Archives alias @kernel='googler -w kernel.org' # Khan Academy alias @khan='googler -w khanacademy.org' # L # Last.fm alias @lfm='googler -w last.fm' # LinkedIn alias @li='googler -w linkedin.com' # Linux.com alias @linux='googler -w linux.com' # Linux Journal alias @lj='googler -w linuxjournal.com' # LinuxQuestions alias @lq='googler -w linuxquestions.org' # LQWiki alias @lqw='googler -w wiki.linuxquestions.org' # LWN.net alias @lwn='googler -w lwn.net' # Linux Cross Reference alias @lxr='googler -w lxr.free-electrons.com' # M # Ubuntu Manpage alias @man='googler -w manpages.ubuntu.com' # Linux manual page alias @man7='googler -w man7.org' # Manga Reader alias @mangar='googler -w mangareader.net' # MLB alias 
@mlb='googler -w mlb.mlb.com' # Mac Rumors alias @mr='googler -w macrumors.com' # N # Google News alias @n='googler -N' # Nature Research alias @nature='googler -N nature.com' # NBA alias @nba='googler -N nba.com' # National Geographic alias @ng='googler -N nationalgeographic.com' # National Programme on Technology Enhanced Learning alias @nptel='googler -w nptel.ac.in' # O # MIT OpenCourseWare alias @ocw='googler -w ocw.mit.edu' # Open Embedded alias @oembedded='googler -w openembedded.org' # OMG! Ubuntu! alias @omg='googler -w omgubuntu.co.uk' # OpenSubtitles alias @op='googler -w opensubtitles.org' # Opensource.com alias @opensource='googler -w opensource.com' # Open Source Alternative alias @osalt='googler -w osalt.com' # OSDev Wiki alias @osdev='googler -w wiki.osdev.org' # OpenWrt alias @owrt='googler -w openwrt.org' # Oxford Dictionary alias @ox='googler -w en.oxforddictionaries.com' # P # Google Patents alias @patent='googler -w patents.google.com' # The Pirate Bay alias @pirate='googler -w thepiratebay.org' # Android Apps alias @play='googler -w play.google.com' # PlayOnLinux alias @playonlinux='googler -w playonlinux.com' # Python documentation alias @python='googler -w docs.python.org' # Q # Quora alias @q='googler -w quora.com' # Wikiquote alias @quotes='googler -w en.wikiquote.org' # R # Reddit alias @r='googler -w reddit.com' # Reader's Digest alias @rd='googler -w rd.com' # RFC Reader alias @rfc='googler -w rfc-editor.org' # Rpmfind alias @rpm='googler -w rpmfind.net' # Rotten Tomatoes alias @rt='googler -w rottentomatoes.com' # S # OnlineSlangDictionary alias @slang='googler -w onlineslangdictionary.com' # Stack Overflow alias @so='googler -w stackoverflow.com' # Softpedia alias @softpedia='googler -w softpedia.com' # SourceForge alias @sourceforge='googler -w sourceforge.net' # Subscene alias @ss='googler -w subscene.com' # Steam alias @st='googler -w store.steampowered.com' # T # Thesaurus.com alias @t='googler -w thesaurus.com' # TED Talks alias
@ted='googler -w ted.com' # The Linux Documentation Project alias @tldp='googler -w tldp.org' # tl;drLegal alias @tldrlegal='googler -w tldrlegal.com' # Torrentz2 alias @to='googler -w torrentz2.eu' # The Pirate Bay alias @tpb='googler -w thepiratebay.org' # TuneIn alias @tunein='googler -w tunein.com' # Twitter alias @tw='googler -w twitter.com' # Twitch alias @twitch='googler -w twitch.tv' # U # Ubuntu Forums alias @ubuntuforums='googler -w ubuntuforums.org' # Ubuntu Packages alias @ubuntupackages='googler -w packages.ubuntu.com' # Ubuntu Wiki alias @uwiki='googler -w wiki.ubuntu.com' # V # Vim Wiki alias @vim='googler -w vim.org' # W # Wikipedia alias @w='googler -w en.wikipedia.org' # Walmart alias @walmart='googler -w walmart.com' # Weather.com alias @weather='googler -w weather.com' # Wikia alias @wikia='googler -w wikia.com' # X # XKCD alias @xkcd='googler -w xkcd.com' # Y # Yahoo alias @y='googler -w yahoo.com' # Yahoo Finance alias @yf='googler -w finance.yahoo.com' # YouTube alias @yt='googler -w youtube.com' # Z # ZDNet alias @zdnet='googler -w zdnet.com' googler-4.0/auto-completion/zsh/000077500000000000000000000000001356731672100167515ustar00rootroot00000000000000googler-4.0/auto-completion/zsh/_googler000066400000000000000000000052261356731672100204760ustar00rootroot00000000000000#compdef googler # # Completion definition for googler. # # Author: # Zhiming Wang # setopt localoptions noshwordsplit noksharrays _googler_query_caching_policy () { # rebuild if cache is more than a day old local -a oldp oldp=( $1(Nm+1) ) (( $#oldp )) } _googler_complete_query () { local prefix=$words[CURRENT] [[ -n $prefix && $prefix != -* ]] || return local cache_id=googler_$prefix zstyle -s :completion:${curcontext}: cache-policy update_policy [[ -z $update_policy ]] && zstyle :completion:${curcontext}: cache_policy _googler_query_caching_policy local -a completions if _cache_invalid $cache_id || ! 
_retrieve_cache $cache_id; then completions=( ${(f)"$(googler --complete $prefix 2>/dev/null)"} ) _store_cache $cache_id completions fi compadd $@ -- $completions } local -a args args=( '(- : *)'{-h,--help}'[show help text and exit]' '(-s --start)'{-s,--start}'[start at the Nth result]:result number' '(-n --count)'{-n,--count}'[show specified number of results (default 10)]:count' '(-N --news)'{-N,--news}'[show results from news section]' '(-c --tld)'{-c,--tld}'[country-specific search with top-level domain]:top level domain without dot' '(-l --lang)'{-l,--lang}'[display in specified language]:language code' '(-x --exact)'{-x,--exact}'[disable automatic spelling correction]' '(--colorize)--colorize[whether to colorize output]:auto/always/never' '(-C --nocolor)'{-C,--nocolor}'[disable color output]' '(--colors)--colors[set output colors]:six-letter string' '(-j --first --lucky)'{-j,--first,--lucky}'[open the first result in a web browser]' '(-t --time)'{-t,--time}'[time limit search]:period (h/d/w/m/y + number)' '(-w --site)'{-w,--site}'[search a site using Google]:domain' '(--unfilter)--unfilter[do not omit similar results]' '(-p --proxy)'{-p,--proxy}'[proxy in HOST:PORT format]:proxy details' '(--notweak)--notweak[disable TCP optimizations, forced TLS 1.2]' '(--json)--json[output in JSON format; implies --exact and --noprompt]' '(--url-handler)--url-handler[cli script or utility]:url opener' '(--show-browser-logs)--show-browser-logs[do not suppress browser output]' '(--np --noprompt)'{--np,--noprompt}'[perform search and exit, do not prompt for further interactions]' '(-u --upgrade)'{-u,--upgrade}'[perform in-place self-upgrade]' '(--include-git)--include-git[when used with --upgrade, upgrade to git master]' '(- : *)'{-v,--version}'[show version number and exit]' '(-d --debug)'{-d,--debug}'[enable debugging]' '*:::query:_googler_complete_query' ) _arguments -S -s $args 
googler-4.0/googler000077500000000000000000003447401356731672100144260ustar00rootroot00000000000000#!/usr/bin/env python3 # # Copyright © 2008 Henri Hakkinen # Copyright © 2015-2019 Arun Prakash Jana # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . import argparse import atexit import base64 import collections import codecs import functools import gzip import html.entities import html.parser import http.client from http.client import HTTPSConnection import locale import logging import os import platform import shutil import signal import socket import ssl from subprocess import Popen, PIPE, DEVNULL import sys import textwrap import unicodedata import urllib.parse import uuid import webbrowser # Python optional dependency compatibility layer try: import readline except ImportError: pass try: import setproctitle setproctitle.setproctitle('googler') except (ImportError, Exception): pass from typing import ( Any, Dict, Generator, Iterable, Iterator, List, Match, Optional, Sequence, Tuple, Union, cast, ) # Basic setup logging.basicConfig(format='[%(levelname)s] %(message)s') logger = logging.getLogger() def sigint_handler(signum, frame): print('\nInterrupted.', file=sys.stderr) sys.exit(1) try: signal.signal(signal.SIGINT, sigint_handler) except ValueError: # signal only works in main thread pass # Constants _VERSION_ = '4.0' COLORMAP = {k: '\x1b[%sm' % v for k, v in { 'a': '30', 'b': '31', 'c': '32', 'd': '33', 'e': '34', 
'f': '35', 'g': '36', 'h': '37', 'i': '90', 'j': '91', 'k': '92', 'l': '93', 'm': '94', 'n': '95', 'o': '96', 'p': '97', 'A': '30;1', 'B': '31;1', 'C': '32;1', 'D': '33;1', 'E': '34;1', 'F': '35;1', 'G': '36;1', 'H': '37;1', 'I': '90;1', 'J': '91;1', 'K': '92;1', 'L': '93;1', 'M': '94;1', 'N': '95;1', 'O': '96;1', 'P': '97;1', 'x': '0', 'X': '1', 'y': '7', 'Y': '7;1', }.items()} USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36' text_browsers = ['elinks', 'links', 'lynx', 'w3m', 'www-browser'] # Self-upgrade parameters # # Downstream packagers are recommended to turn off the entire self-upgrade # mechanism through # # make disable-self-upgrade # # before running `make install'. ENABLE_SELF_UPGRADE_MECHANISM = True API_REPO_BASE = 'https://api.github.com/repos/jarun/googler' RAW_DOWNLOAD_REPO_BASE = 'https://raw.githubusercontent.com/jarun/googler' debugger = False # Monkeypatch textwrap for CJK wide characters. def monkeypatch_textwrap_for_cjk(): try: if textwrap.wrap.patched: return except AttributeError: pass psl_textwrap_wrap = textwrap.wrap def textwrap_wrap(text, width=70, **kwargs): if width <= 2: width = 2 # We first add a U+0000 after each East Asian Fullwidth or East # Asian Wide character, then fill to width - 1 (so that if a NUL # character ends up on a new line, we still have one last column # to spare for the preceding wide character). Finally we strip # all the NUL characters. 
# # East Asian Width: https://www.unicode.org/reports/tr11/ return [ line.replace('\0', '') for line in psl_textwrap_wrap( ''.join( ch + '\0' if unicodedata.east_asian_width(ch) in ('F', 'W') else ch for ch in unicodedata.normalize('NFC', text) ), width=width - 1, **kwargs ) ] def textwrap_fill(text, width=70, **kwargs): return '\n'.join(textwrap_wrap(text, width=width, **kwargs)) textwrap.wrap = textwrap_wrap textwrap.fill = textwrap_fill textwrap.wrap.patched = True textwrap.fill.patched = True monkeypatch_textwrap_for_cjk() CoordinateType = Tuple[int, int] class TrackedTextwrap: """ Implements a text wrapper that tracks the position of each source character, and can correctly insert zero-width sequences at given offsets of the source text. Wrapping result should be the same as that from PSL textwrap.wrap with default settings except expand_tabs=False. """ def __init__(self, text: str, width: int): self._original = text # Do the job of replace_whitespace first so that we can easily # match text to wrapped lines later. Note that this operation # does not change text length or offsets. whitespace = "\t\n\v\f\r " whitespace_trans = str.maketrans(whitespace, " " * len(whitespace)) text = text.translate(whitespace_trans) self._lines = textwrap.wrap( text, width, expand_tabs=False, replace_whitespace=False ) # self._coords track the (row, column) coordinate of each source # character in the result text. It is indexed by offset in # source text. self._coords = [] # type: List[CoordinateType] offset = 0 try: if not self._lines: # Source text only has whitespaces. We add an empty line # in order to produce meaningful coordinates. self._lines = [""] for row, line in enumerate(self._lines): assert text[offset : offset + len(line)] == line col = 0 for _ in line: self._coords.append((row, col)) offset += 1 col += 1 # All subsequent dropped whitespaces map to the last, imaginary column # (the EOL character if you wish) of the current line. 
while offset < len(text) and text[offset] == " ": self._coords.append((row, col)) offset += 1 # One past the final character (think of it as EOF) should # be treated as a valid offset. self._coords.append((row, col)) except AssertionError: raise RuntimeError( "TrackedTextwrap: the impossible happened at offset {} of text {!r}".format( offset, self._original ) ) # seq should be a zero-width sequence, e.g., an ANSI escape sequence. # May raise IndexError if offset is out of bounds. def insert_zero_width_sequence(self, seq: str, offset: int) -> None: row, col = self._coords[offset] line = self._lines[row] self._lines[row] = line[:col] + seq + line[col:] # Shift coordinates of all characters after the given character # on the same line. shift = len(seq) offset += 1 while offset < len(self._coords) and self._coords[offset][0] == row: _, col = self._coords[offset] self._coords[offset] = (row, col + shift) offset += 1 @property def original(self) -> str: return self._original @property def lines(self) -> List[str]: return self._lines @property def wrapped(self) -> str: return "\n".join(self._lines) # May raise IndexError if offset is out of bounds. def get_coordinate(self, offset: int) -> CoordinateType: return self._coords[offset] ### begin dim (DOM implementation with CSS support) ### ### https://github.com/zmwangx/dim/blob/master/dim.py ### import html import re import textwrap from collections import OrderedDict from enum import Enum from html.parser import HTMLParser SelectorGroupLike = Union[str, "SelectorGroup", "Selector"] class Node(object): """ Represents a DOM node. Parts of JavaScript's DOM ``Node`` API and ``Element`` API are mirrored here, with extensions. In particular, ``querySelector`` and ``querySelectorAll`` are mirrored. 
Notable properties and methods: :meth:`attr()`, :attr:`classes`, :attr:`html`, :attr:`text`, :meth:`ancestors()`, :meth:`descendants()`, :meth:`select()`, :meth:`select_all()`, :meth:`matched_by()`, Attributes: tag (:class:`Optional`\\[:class:`str`]) attrs (:class:`Dict`\\[:class:`str`, :class:`str`]) parent (:class:`Optional`\\[:class:`Node`]) children (:class:`List`\\[:class:`Node`]) """ # Meant to be reimplemented by subclasses. def __init__(self) -> None: self.tag = None # type: Optional[str] self.attrs = {} # type: Dict[str, str] self.parent = None # type: Optional[Node] self.children = [] # type: List[Node] # Used in DOMBuilder. self._partial = False # HTML representation of the node. Meant to be implemented by # subclasses. def __str__(self) -> str: # pragma: no cover raise NotImplementedError def select(self, selector: SelectorGroupLike) -> Optional["Node"]: """DOM ``querySelector`` clone. Returns one match (if any).""" selector = self._normalize_selector(selector) for node in self._select_all(selector): return node return None def query_selector(self, selector: SelectorGroupLike) -> Optional["Node"]: """Alias of :meth:`select`.""" return self.select(selector) def select_all(self, selector: SelectorGroupLike) -> List["Node"]: """DOM ``querySelectorAll`` clone. Returns all matches in a list.""" selector = self._normalize_selector(selector) return list(self._select_all(selector)) def query_selector_all(self, selector: SelectorGroupLike) -> List["Node"]: """Alias of :meth:`select_all`.""" return self.select_all(selector) def matched_by( self, selector: SelectorGroupLike, root: Optional["Node"] = None ) -> bool: """ Checks whether this node is matched by `selector`. See :meth:`SelectorGroup.matches()`. 
""" selector = self._normalize_selector(selector) return selector.matches(self, root=root) @staticmethod def _normalize_selector(selector: SelectorGroupLike) -> "SelectorGroup": if isinstance(selector, str): return SelectorGroup.from_str(selector) if isinstance(selector, SelectorGroup): return selector if isinstance(selector, Selector): return SelectorGroup([selector]) raise ValueError("not a selector or group of selectors: %s" % repr(selector)) def _select_all(self, selector: "SelectorGroup") -> Generator["Node", None, None]: for descendant in self.descendants(): if selector.matches(descendant, root=self): yield descendant def child_nodes(self) -> List["Node"]: return self.children def first_child(self) -> Optional["Node"]: if self.children: return self.children[0] else: return None def first_element_child(self) -> Optional["Node"]: for child in self.children: if isinstance(child, ElementNode): return child return None def last_child(self) -> Optional["Node"]: if self.children: return self.children[-1] else: return None def last_element_child(self) -> Optional["Node"]: for child in reversed(self.children): if isinstance(child, ElementNode): return child return None def next_sibling(self) -> Optional["Node"]: """.. note:: Not O(1), use with caution.""" next_siblings = self.next_siblings() if next_siblings: return next_siblings[0] else: return None def next_siblings(self) -> List["Node"]: parent = self.parent if not parent: return [] try: index = parent.children.index(self) return parent.children[index + 1 :] except ValueError: # pragma: no cover raise ValueError("node is not found in children of its parent") def next_element_sibling(self) -> Optional["ElementNode"]: """.. note:: Not O(1), use with caution.""" for sibling in self.next_siblings(): if isinstance(sibling, ElementNode): return sibling return None def previous_sibling(self) -> Optional["Node"]: """.. 
note:: Not O(1), use with caution.""" previous_siblings = self.previous_siblings() if previous_siblings: return previous_siblings[0] else: return None def previous_siblings(self) -> List["Node"]: """ Compared to the natural DOM order, the order of returned nodes are reversed. That is, the adjacent sibling (if any) is the first in the returned list. """ parent = self.parent if not parent: return [] try: index = parent.children.index(self) if index > 0: return parent.children[index - 1 :: -1] else: return [] except ValueError: # pragma: no cover raise ValueError("node is not found in children of its parent") def previous_element_sibling(self) -> Optional["ElementNode"]: """.. note:: Not O(1), use with caution.""" for sibling in self.previous_siblings(): if isinstance(sibling, ElementNode): return sibling return None def ancestors( self, *, root: Optional["Node"] = None ) -> Generator["Node", None, None]: """ Ancestors are generated in reverse order of depth, stopping at `root`. A :class:`RuntimeException` is raised if `root` is not in the ancestral chain. """ if self is root: return ancestor = self.parent while ancestor is not root: if ancestor is None: raise RuntimeError("provided root node not found in ancestral chain") yield ancestor ancestor = ancestor.parent if root: yield root def descendants(self) -> Generator["Node", None, None]: """Descendants are generated in depth-first order.""" for child in self.children: yield child yield from child.descendants() def attr(self, attr: str) -> Optional[str]: """Returns the attribute if it exists on the node, otherwise ``None``.""" return self.attrs.get(attr) @property def html(self) -> str: """ HTML representation of the node. (For a :class:`TextNode`, :meth:`html` returns the escaped version of the text. 
class ElementNode(Node):
    """
    Represents an element node.

    Note that tag and attribute names are case-insensitive; attribute values
    are case-sensitive.

    Attributes set by the constructor:
        tag: lower-cased tag name.
        attrs: ordered mapping of lower-cased attribute name to value; a
            missing (``None``) value is stored as the empty string.
        parent: the parent node, if any.
        children: list of child nodes (a copy of the input sequence).
    """

    def __init__(
        self,
        tag: str,
        attrs: Iterable[Tuple[str, Optional[str]]],
        *,
        parent: Optional["Node"] = None,
        children: Optional[Sequence["Node"]] = None
    ) -> None:
        Node.__init__(self)
        self.tag = tag.lower()  # type: str
        self.attrs = OrderedDict((attr.lower(), val or "") for attr, val in attrs)
        self.parent = parent
        self.children = list(children or [])

    def __repr__(self) -> str:
        """Debug representation: tag, then attrs and children when present."""
        s = "<" + self.tag
        if self.attrs:
            s += " attrs=%s" % repr(list(self.attrs.items()))
        if self.children:
            s += " children=%s" % repr(self.children)
        s += ">"
        return s

    # https://ipython.org/ipython-doc/3/api/generated/IPython.lib.pretty.html
    def _repr_pretty_(self, p: Any, cycle: bool) -> None:  # pragma: no cover
        """Pretty-printer hook; emits the tree with ANSI bold/underline."""
        if cycle:
            raise RuntimeError("cycle detected in DOM tree")
        p.text("<\x1b[1m%s\x1b[0m" % self.tag)
        if self.attrs:
            p.text(" attrs=%s" % repr(list(self.attrs.items())))
        if self.children:
            p.text(" children=[")
            if len(self.children) == 1 and isinstance(self.first_child(), TextNode):
                # A lone text child is printed inline.
                p.text("\x1b[4m%s\x1b[0m" % repr(self.first_child()))
            else:
                with p.indent(2):
                    for child in self.children:
                        p.break_()
                        if hasattr(child, "_repr_pretty_"):
                            child._repr_pretty_(p, False)  # type: ignore
                        else:
                            p.text("\x1b[4m%s\x1b[0m" % repr(child))
                        p.text(",")
                p.break_()
            p.text("]")
        p.text(">")

    def __str__(self) -> str:
        """HTML representation of the node."""
        s = "<" + self.tag
        for attr, val in self.attrs.items():
            s += ' %s="%s"' % (attr, html.escape(val))
        if self.children:
            s += ">"
            s += "".join(str(child) for child in self.children)
            # The closing tag needs a %s placeholder; a placeholder-free
            # format string here raises TypeError.
            s += "</%s>" % self.tag
        else:
            if _tag_is_void(self.tag):
                # Void elements have no closing tag; serialize self-closed.
                s += "/>"
            else:
                s += "></%s>" % self.tag
        return s

    @property
    def text(self) -> str:
        """The concatenation of all descendant text nodes."""
        return "".join(child.text for child in self.children)
def handle_endtag(self, tag: str) -> None:
    """
    Close the innermost still-open element.

    Every finished node sitting above that element on the stack is popped
    and becomes one of its children (in document order); the element is
    then marked as finished.

    Raises:
        DOMBuilderException: there is no open element left on the stack, or
            the open element's tag differs from `tag`.
    """
    closing = tag.lower()
    stack = self._stack

    # Collect finished nodes until an unfinished (partial) element shows up.
    collected = []
    while stack:
        if stack[-1]._partial:
            break
        collected.append(stack.pop())

    if not stack:
        raise DOMBuilderException(self.getpos(), "extra end tag: %s" % repr(closing))

    element = stack[-1]
    if element.tag != closing:
        raise DOMBuilderException(
            self.getpos(),
            "expecting end tag %s, got %s" % (repr(element.tag), repr(closing)),
        )

    # Nodes were popped last-to-first; flip them back into document order.
    collected.reverse()
    element.children = collected
    element._partial = False
    for node in collected:
        node.parent = element
<br> vs <br/>
class SelectorGroup:
    """
    Represents a group of CSS selectors.

    A group of CSS selectors is simply a comma-separated list of
    selectors. [#]_ See :class:`Selector` documentation for the scope of
    support.

    Typically, a :class:`SelectorGroup` is constructed from a string (e.g.,
    ``th.center, td.center``) using the factory function :meth:`from_str`.

    .. [#] https://www.w3.org/TR/selectors-3/#grouping
    """

    def __init__(self, selectors: Iterable["Selector"]) -> None:
        # Materialize once so the group supports len(), indexing and
        # repeated iteration even when given a one-shot iterable.
        self._selectors = list(selectors)

    def __repr__(self) -> str:
        # The format string must carry a %s placeholder; without one the
        # % operator raises TypeError.
        return "<SelectorGroup %s>" % repr(str(self))

    def __str__(self) -> str:
        """Comma-separated string form of the member selectors."""
        return ", ".join(str(selector) for selector in self._selectors)

    def __len__(self) -> int:
        return len(self._selectors)

    def __getitem__(self, index: int) -> "Selector":
        return self._selectors[index]

    def __iter__(self) -> Iterator["Selector"]:
        return iter(self._selectors)

    @classmethod
    def from_str(cls, s: str) -> "SelectorGroup":
        """
        Parses input string into a group of selectors.

        :class:`SelectorParserException` is raised on invalid input. See
        :class:`Selector` documentation for the scope of support.

        Args:
            s: input string

        Returns:
            Parsed group of selectors.
        """
        i = 0
        selectors = []
        # Each Selector.from_str call consumes one selector and returns the
        # cursor advanced past it.
        while i < len(s):
            selector, i = Selector.from_str(s, i)
            selectors.append(selector)
        if not selectors:
            raise SelectorParserException(s, i, "selector group is empty")
        return cls(selectors)

    def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
        """
        Decides whether the group of selectors matches `node`.

        The group of selectors matches `node` as long as one of the
        selectors matches `node`.

        If `root` is provided and child and/or descendant combinators
        are involved, parent/ancestor lookup terminates at `root`.
        """
        return any(selector.matches(node, root=root) for selector in self)
This class in fact holds a single sequence, with an optional combinator and reference to the previous sequence. For instance, ``main#main p.important.definition > a.term[id][href]`` would be parsed into (schematically) the following structure:: ">" tag='a' classes=('term') attrs=([id], [href]) ~> " " tag='p' classes=('important', 'definition') ~> tag='main' id='main' Each line is held in a separate instance of :class:`Selector`, linked together by the :attr:`previous` attribute. Supported grammar (from selectors level 3 [#selectors-3]_): - Type selectors; - Universal selectors; - Class selectors; - ID selectors; - Attribute selectors; - Combinators. Unsupported grammar: - Pseudo-classes; - Pseudo-elements; - Namespace prefixes (``ns|``, ``*|``, ``|``) in any part of any selector. Rationale: - Pseudo-classes have too many variants, a few of which even complete with an admittedly not-so-complex minilanguage. These add up to a lot of code. - Pseudo-elements are useless outside rendering contexts, hence out of scope. - Namespace support is too niche to be worth the parsing headache. *Using namespace prefixes may confuse the parser!* Note that the parser only loosely follows the spec and priotizes ease of parsing (which includes readability and *writability* of regexes), so some invalid selectors may be accepted (in fact, false positives abound, but accepting valid inputs is a much more important goal than rejecting invalid inputs for this library), and some valid selectors may be rejected (but as long as you stick to the scope outlined above and common sense you should be fine; the false negatives shouldn't be used by actual human beings anyway). 
In particular, whitespace character is simplified to ``\\s`` (ASCII mode) despite CSS spec not counting U+000B (VT) as whitespace, identifiers are simplified to ``[\\w-]+`` (ASCII mode), and strings (attribute selector values can be either identifiers or strings) allow escaped quotes (i.e., ``\\'`` inside single-quoted strings and ``\\"`` inside double-quoted strings) but everything else is interpreted literally. The exact specs for CSS identifiers and strings can be found at [#]_. Certain selectors and combinators may be implemented in the parser but not implemented in matching and/or selection APIs. .. [#selectors-3] https://www.w3.org/TR/selectors-3/ .. [#] https://www.w3.org/TR/CSS21/syndata.html Attributes: tag (:class:`Optional`\\[:class:`str`]): Type selector. classes (:class:`List`\\[:class:`str`]): Class selectors. id (:class:`Optional`\\[:class:`str`]): ID selector. attrs (:class:`List`\\[:class:`AttributeSelector`]): Attribute selectors. combinator (:class:`Optional`\\[:class:`Combinator`]): Combinator with the previous sequence of simple selectors in chain. previous (:class:`Optional`\\[:class:`Selector`]): Reference to the previous sequence of simple selectors in chain. 
def __str__(self) -> str:
    """
    Serialize the whole selector chain, leftmost sequence first.

    The chain is stored right to left (each instance links to its
    `previous` sequence), so pieces are collected while walking backwards
    and reversed at the end.

    Raises:
        RuntimeError: a sequence in the chain carries a combinator that has
            no textual form here.
    """
    delimiter_for = {
        Combinator.DESCENDANT: " ",
        Combinator.CHILD: " > ",
        Combinator.NEXT_SIBLING: " + ",
        Combinator.SUBSEQUENT_SIBLING: " ~ ",
    }

    pieces = []
    seq = self
    while seq.previous:
        delimiter = delimiter_for.get(seq.combinator)
        if delimiter is None:  # pragma: no cover
            # Report the combinator of the sequence being inspected, not the
            # one on the rightmost sequence.
            raise RuntimeError(
                "unimplemented combinator: %s" % repr(seq.combinator)
            )
        pieces.append(delimiter + seq._sequence_str_())
        seq = seq.previous
    # The leftmost sequence has nothing before it, hence no delimiter.
    pieces.append(seq._sequence_str_())
    return "".join(reversed(pieces))
See :class:`Selector` documentation for the scope of support. If you need to completely consume a string representing (potentially) a group of selectors, use :meth:`SelectorGroup.from_str()`. Args: s: input string cursor: initial cursor position on `s` Returns: A tuple containing the parsed selector and the moved the cursor (either after a comma-delimiter, or at EOS). """ # Simple selectors. TYPE_SEL = re.compile(r"[\w-]+", re.A) UNIVERSAL_SEL = re.compile(r"\*") ATTR_SEL = re.compile( r"""\[ \s*(?P[\w-]+)\s* ( (?P[~|^$*]?=)\s* ( (?P[\w-]+)| (?P (?P['"]) (?P.*?) (?\s*") NEXT_SIB_COM = re.compile(r"\s*\+\s*") SUB_SIB_COM = re.compile(r"\s*~\s*") # Misc WHITESPACE = re.compile(r"\s*") END_OF_SELECTOR = re.compile(r"\s*($|,)") tag = None classes = [] id = None attrs = [] combinator = None selector = None previous_combinator = None i = cursor # Skip leading whitespace m = WHITESPACE.match(s, i) if m: i = m.end() while i < len(s): # Parse one simple selector. # # PEP 572 (assignment expressions; the one that burned Guido # so much that he resigned as BDFL) would have been nice; it # would have saved us from all the regex match # reassignments, and worse still, the casts, since mypy # complains about getting Optional[Match[str]] instead of # Match[str]. 
if TYPE_SEL.match(s, i): if tag: raise SelectorParserException(s, i, "multiple type selectors found") m = cast(Match[str], TYPE_SEL.match(s, i)) tag = m.group() elif UNIVERSAL_SEL.match(s, i): m = cast(Match[str], UNIVERSAL_SEL.match(s, i)) elif ATTR_SEL.match(s, i): m = cast(Match[str], ATTR_SEL.match(s, i)) attr = m.group("attr") op = m.group("op") val_identifier = m.group("val_identifier") quote = m.group("quote") val_string_inner = m.group("val_string_inner") if val_identifier is not None: val = val_identifier elif val_string_inner is not None: val = val_string_inner.replace("\\" + quote, quote) else: val = None if op is None: type = AttributeSelectorType.BARE elif op == "=": type = AttributeSelectorType.EQUAL elif op == "~=": type = AttributeSelectorType.TILDE elif op == "|=": type = AttributeSelectorType.PIPE elif op == "^=": type = AttributeSelectorType.CARET elif op == "$=": type = AttributeSelectorType.DOLLAR elif op == "*=": type = AttributeSelectorType.ASTERISK else: # pragma: no cover raise SelectorParserException( s, i, "unrecognized operator %s in attribute selector" % repr(op), ) attrs.append(AttributeSelector(attr, val, type)) elif CLASS_SEL.match(s, i): m = cast(Match[str], CLASS_SEL.match(s, i)) classes.append(m.group(1)) elif ID_SEL.match(s, i): if id: raise SelectorParserException(s, i, "multiple id selectors found") m = cast(Match[str], ID_SEL.match(s, i)) id = m.group(1) elif PSEUDO_CLASS_SEL.match(s, i): raise SelectorParserException(s, i, "pseudo-classes not supported") elif PSEUDO_ELEM_SEL.match(s, i): raise SelectorParserException(s, i, "pseudo-elements not supported") else: raise SelectorParserException( s, i, "expecting simple selector, found none" ) i = m.end() # Try to parse a combinator, or end the selector. 
def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
    """
    Decides whether the selector matches `node`.

    Each sequence of simple selectors in the selector's chain must be
    matched for a positive.

    If `root` is provided and child and/or descendant combinators are
    involved, parent/ancestor lookup terminates at `root`: `root` itself
    may still match as a parent/ancestor, but nothing above it is
    consulted. `root` is handed on to every recursive match so the limit
    holds for the entire chain.
    """
    # This sequence's own simple selectors.
    if self.tag:
        if not node.tag or node.tag != self.tag:
            return False
    if self.id:
        if node.attrs.get("id") != self.id:
            return False
    if self.classes:
        node_classes = node.classes
        if any(class_ not in node_classes for class_ in self.classes):
            return False
    if self.attrs:
        if not all(attr_selector.matches(node) for attr_selector in self.attrs):
            return False

    if not self.previous:
        return True

    if self.combinator == Combinator.DESCENDANT:
        # Walk the parent chain by hand so that the lookup really stops at
        # `root` (inclusive), and so a `root` outside the chain simply
        # means "walk to the top" instead of raising.
        ancestor = None if node is root else node.parent
        while ancestor is not None:
            if self.previous.matches(ancestor, root=root):
                return True
            if ancestor is root:
                break
            ancestor = ancestor.parent
        return False
    elif self.combinator == Combinator.CHILD:
        if node is root or node.parent is None:
            return False
        else:
            return self.previous.matches(node.parent, root=root)
    elif self.combinator == Combinator.NEXT_SIBLING:
        sibling = node.previous_element_sibling()
        if not sibling:
            return False
        else:
            return self.previous.matches(sibling, root=root)
    elif self.combinator == Combinator.SUBSEQUENT_SIBLING:
        return any(
            self.previous.matches(sibling, root=root)
            for sibling in node.previous_siblings()
            if isinstance(sibling, ElementNode)
        )
    else:  # pragma: no cover
        raise RuntimeError("unimplemented combinator: %s" % repr(self.combinator))
def matches(self, node: "Node") -> bool:
    """
    Decide whether `node` satisfies this attribute selector.

    A node lacking the attribute never matches. Otherwise the node's value
    is tested according to the selector type: presence only, exact
    equality, whitespace-separated word, exact-or-hyphen-prefixed, prefix,
    suffix, or substring. The prefix/suffix/substring forms never match
    when the selector value is empty.

    Raises:
        RuntimeError: the selector type is not one of the handled members.
    """
    actual = node.attrs.get(self.attr)
    if actual is None:
        return False

    kind = self.type
    expected = self.val

    if kind == AttributeSelectorType.BARE:
        return True
    if kind == AttributeSelectorType.EQUAL:
        return actual == expected
    if kind == AttributeSelectorType.TILDE:
        return expected in actual.split()
    if kind == AttributeSelectorType.PIPE:
        return actual == expected or actual.startswith("%s-" % expected)
    if kind == AttributeSelectorType.CARET:
        return bool(expected and actual.startswith(expected))
    if kind == AttributeSelectorType.DOLLAR:
        return bool(expected and actual.endswith(expected))
    if kind == AttributeSelectorType.ASTERISK:
        return bool(expected and expected in actual)
    # pragma: no cover
    raise RuntimeError("unimplemented attribute selector: %s" % repr(kind))
def open_url(url):
    """Open an URL in the user's default web browser.

    The string attribute ``open_url.url_handler`` can be used to open URLs
    in a custom CLI script or utility. A subprocess is spawned with url as
    the parameter in this case instead of the usual webbrowser.open() call.

    Whether the browser's output (both stdout and stderr) are suppressed
    depends on the boolean attribute ``open_url.suppress_browser_output``.
    If the attribute is not set upon a call, set it to a default value,
    which means False if BROWSER is set to a known text-based browser --
    elinks, links, lynx, w3m or 'www-browser'; or True otherwise.

    The string attribute ``open_url.override_text_browser`` can be used to
    ignore env var BROWSER as well as some known text-based browsers and
    attempt to open url in a GUI browser available.
    Note: If a GUI browser is indeed found, this option ignores the program
          option `show-browser-logs`
    """
    logger.debug('Opening %s', url)

    # Custom URL handler gets max priority
    if hasattr(open_url, 'url_handler'):
        p = Popen([open_url.url_handler, url], stdin=PIPE)
        p.communicate()
        return

    browser = webbrowser.get()
    override = open_url.override_text_browser
    if override:
        browser_output = open_url.suppress_browser_output
        for name in [b for b in webbrowser._tryorder if b not in text_browsers]:
            browser = webbrowser.get(name)
            logger.debug(browser)

            # Found a GUI browser, suppress browser output
            open_url.suppress_browser_output = True
            break

    suppress = open_url.suppress_browser_output
    saved_fds = None
    try:
        if suppress:
            # Duplicate BOTH descriptors before touching either one. dup()
            # hands out the lowest free descriptor, so duplicating stdout
            # after fd 2 has been closed would park the saved copy on fd 2,
            # where the redirection below would overwrite it.
            saved_fds = (os.dup(1), os.dup(2))
            devnull = os.open(os.devnull, os.O_RDWR)
            try:
                os.dup2(devnull, 1)
                os.dup2(devnull, 2)
            finally:
                # fds 1 and 2 now hold their own references to the null
                # device; the temporary descriptor is no longer needed.
                os.close(devnull)
        browser.open(url, new=2)
    finally:
        if saved_fds is not None:
            saved_stdout, saved_stderr = saved_fds
            os.dup2(saved_stdout, 1)
            os.dup2(saved_stderr, 2)
            # Release the saved duplicates so repeated calls do not
            # accumulate descriptors.
            os.close(saved_stdout)
            os.close(saved_stderr)
        if override:
            # Put the user's setting back even if the browser call raised.
            open_url.suppress_browser_output = browser_output
class HardenedHTTPSConnection(HTTPSConnection):
    """Overrides HTTPSConnection.connect to specify TLS version

    NOTE: TLS 1.2 is supported from Python 3.4
    """

    def __init__(self, host, **kwargs):
        HTTPSConnection.__init__(self, host, **kwargs)

    def connect(self, notweak=False):
        """Connect to the host, preferring a TLS >= 1.2 context.

        Parameters
        ----------
        notweak : bool, optional
            When true, skip the socket tweaks and the custom TLS context
            and use the stock ``HTTPSConnection.connect``.

        Notes
        -----
        The stock implementation is also used when a tunnel (proxy) is
        configured or when no suitable TLS context can be built. It opens
        its own socket, so no socket is created here on those paths.
        """
        if notweak or getattr(self, '_tunnel_host', None):
            # Fallback
            HTTPSConnection.connect(self)
            return

        # Try to use TLS 1.2
        # NOTE(review): a context built directly with ssl.SSLContext() does
        # not appear to enable certificate/hostname verification, and
        # wrap_socket() is called without server_hostname -- confirm whether
        # that is intended before changing it.
        ssl_context = None
        if hasattr(ssl, 'PROTOCOL_TLS'):
            # Since Python 3.5.3
            ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS)
            if hasattr(ssl_context, "minimum_version"):
                # Python 3.7 with OpenSSL 1.1.0g or later
                ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2
            else:
                ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
                                        ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
        elif hasattr(ssl, 'PROTOCOL_TLSv1_2'):
            # Since Python 3.4
            ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)

        if ssl_context is None:
            # Fallback
            HTTPSConnection.connect(self)
            return

        sock = socket.create_connection((self.host, self.port),
                                        self.timeout, self.source_address)
        try:
            # Optimizations not available on OS X
            if sys.platform.startswith('linux'):
                try:
                    sock.setsockopt(socket.SOL_TCP, socket.TCP_DEFER_ACCEPT, 1)
                    sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, 1)
                    sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 524288)
                except OSError:
                    # Doesn't work on Windows' Linux subsystem (#179)
                    logger.debug('setsockopt failed')

            self.sock = ssl_context.wrap_socket(sock)
        except Exception:
            # Do not leave the raw socket open if the TLS setup fails.
            sock.close()
            raise
def update(self, opts=None, **kwargs):
    """Update the URL with the given options.

    Parameters
    ----------
    opts : dict or argparse.Namespace, optional
        Carries options that affect the Google Search/News URL.  The
        list of currently recognized option keys with expected value
        types:

            duration: str (GooglerArgumentParser.is_duration)
            exact: bool
            keywords: str or list of strs
            lang: str
            news: bool
            num: int
            sites: stored as given (note the key is ``sites``)
            start: int
            tld: str
            unfilter: bool

        `opts` is only read; neither the mapping nor the namespace passed
        in is modified.

    Other Parameters
    ----------------
    kwargs
        The `kwargs` dict extends `opts`, that is, options can be
        specified either way, in `opts` or as individual keyword
        arguments.

    """
    # Work on a private copy: merging kwargs straight into the caller's
    # dict (or a Namespace's __dict__) would leak them back to the caller.
    if opts is None:
        merged = {}
    elif hasattr(opts, '__dict__'):
        merged = dict(opts.__dict__)
    else:
        merged = dict(opts)
    merged.update(kwargs)
    opts = merged

    qd = self._query_dict
    if 'duration' in opts and opts['duration']:
        qd['tbs'] = 'qdr:%s' % opts['duration']
    if 'exact' in opts:
        if opts['exact']:
            qd['nfpr'] = 1
        else:
            qd.pop('nfpr', None)
    if 'keywords' in opts:
        self._keywords = opts['keywords']
    if 'lang' in opts and opts['lang']:
        qd['hl'] = opts['lang']
    if 'news' in opts:
        if opts['news']:
            qd['tbm'] = 'nws'
        else:
            qd.pop('tbm', None)
    if 'num' in opts:
        self._num = opts['num']
    if 'sites' in opts:
        self._sites = opts['sites']
    if 'start' in opts:
        self._start = opts['start']
    if 'tld' in opts:
        self._tld = opts['tld']
    if 'unfilter' in opts and opts['unfilter']:
        qd['filter'] = 0
def prev_page(self):
    """Navigate to the previous page.

    The start offset moves back by one page size and is clamped at zero.

    Raises
    ------
    ValueError
        If already at the first page (``start=0`` in the current
        query string).

    """
    current = self._start
    if current == 0:
        raise ValueError('Already at the first page.')

    page_size = self._num
    if current > page_size:
        self._start = current - page_size
    else:
        # Less than (or exactly) one page in: land on the first page.
        self._start = 0
'google.dj', 'dk': 'google.dk', 'dm': 'google.dm', 'do': 'google.com.do', 'dz': 'google.dz', 'ec': 'google.com.ec', 'ee': 'google.ee', 'eg': 'google.com.eg', 'es': 'google.es', 'et': 'google.com.et', 'fi': 'google.fi', 'fj': 'google.com.fj', 'fm': 'google.fm', 'fr': 'google.fr', 'ga': 'google.ga', 'ge': 'google.ge', 'gf': 'google.gf', 'gg': 'google.gg', 'gh': 'google.com.gh', 'gi': 'google.com.gi', 'gl': 'google.gl', 'gm': 'google.gm', 'gp': 'google.gp', 'gr': 'google.gr', 'gt': 'google.com.gt', 'gy': 'google.gy', 'hk': 'google.com.hk', 'hn': 'google.hn', 'hr': 'google.hr', 'ht': 'google.ht', 'hu': 'google.hu', 'id': 'google.co.id', 'ie': 'google.ie', 'il': 'google.co.il', 'im': 'google.im', 'in': 'google.co.in', 'io': 'google.io', 'iq': 'google.iq', 'is': 'google.is', 'it': 'google.it', 'je': 'google.je', 'jm': 'google.com.jm', 'jo': 'google.jo', 'jp': 'google.co.jp', 'ke': 'google.co.ke', 'kg': 'google.kg', 'kh': 'google.com.kh', 'ki': 'google.ki', 'kr': 'google.co.kr', 'kw': 'google.com.kw', 'kz': 'google.kz', 'la': 'google.la', 'lb': 'google.com.lb', 'lc': 'google.com.lc', 'li': 'google.li', 'lk': 'google.lk', 'ls': 'google.co.ls', 'lt': 'google.lt', 'lu': 'google.lu', 'lv': 'google.lv', 'ly': 'google.com.ly', 'ma': 'google.co.ma', 'md': 'google.md', 'me': 'google.me', 'mg': 'google.mg', 'mk': 'google.mk', 'ml': 'google.ml', 'mm': 'google.com.mm', 'mn': 'google.mn', 'ms': 'google.ms', 'mt': 'google.com.mt', 'mu': 'google.mu', 'mv': 'google.mv', 'mw': 'google.mw', 'mx': 'google.com.mx', 'my': 'google.com.my', 'mz': 'google.co.mz', 'na': 'google.com.na', 'ne': 'google.ne', 'nf': 'google.com.nf', 'ng': 'google.com.ng', 'ni': 'google.com.ni', 'nl': 'google.nl', 'no': 'google.no', 'np': 'google.com.np', 'nr': 'google.nr', 'nu': 'google.nu', 'nz': 'google.co.nz', 'om': 'google.com.om', 'pa': 'google.com.pa', 'pe': 'google.com.pe', 'pg': 'google.com.pg', 'ph': 'google.com.ph', 'pk': 'google.com.pk', 'pl': 'google.pl', 'pn': 'google.co.pn', 'pr': 'google.com.pr', 'ps': 
@property
def query(self):
    """The query string.

    Built from a copy of the stored query dict plus ``num``, ``start``
    (both omitted when at their defaults) and the ``q`` parameter, which
    is assembled from the current keywords and site restrictions.
    Parameters are emitted in sorted key order.
    """
    quote = urllib.parse.quote_plus
    params = dict(self._query_dict)

    # Only send paging parameters that differ from Google's defaults.
    if self._num != 10:
        params['num'] = self._num
    if self._start:
        params['start'] = self._start

    keywords = self._keywords
    sites = self._sites

    if not keywords:
        keyword_part = ''
    elif isinstance(keywords, list):
        keyword_part = '+'.join(map(quote, keywords))
    else:
        keyword_part = quote(keywords)

    if sites:
        site_part = '+OR'.join('+site:' + quote(site) for site in sites)
    else:
        site_part = ''

    params['q'] = keyword_part + site_part

    return '&'.join('%s=%s' % pair for pair in sorted(params.items()))
def new_connection(self, host=None, port=None, timeout=45):
    """Close the current connection (if any) and establish a new one.

    Parameters
    ----------
    See http.client.HTTPSConnection for documentation of the
    parameters.

    Renew the connection (i.e., reuse the current host and port) if
    host is None or empty.

    Raises
    ------
    GoogleConnectionError
        If connecting to the proxy server, or directly to the host,
        fails for any reason.
    """
    if self._conn:
        self._conn.close()

    if not host:
        host = self._host
        port = self._port
    self._host = host
    self._port = port
    host_display = host + (':%d' % port if port else '')

    proxy = self._proxy

    if proxy:
        proxy_user_passwd, proxy_host_port = parse_proxy_spec(proxy)

        logger.debug('Connecting to proxy server %s', proxy_host_port)
        self._conn = HardenedHTTPSConnection(proxy_host_port, timeout=timeout)

        logger.debug('Tunnelling to host %s', host_display)
        connect_headers = {}
        if proxy_user_passwd:
            credentials = base64.b64encode(proxy_user_passwd.encode('utf-8'))
            connect_headers['Proxy-Authorization'] = 'Basic %s' % credentials.decode('utf-8')
        self._conn.set_tunnel(host, port=port, headers=connect_headers)

        try:
            self._conn.connect(self._notweak)
        except Exception as e:
            # Report only host:port. The raw proxy spec may embed
            # "user:password@" and must not end up in error output.
            msg = 'Failed to connect to proxy server %s: %s.' % (proxy_host_port, e)
            raise GoogleConnectionError(msg) from e
    else:
        logger.debug('Connecting to new host %s', host_display)
        self._conn = HardenedHTTPSConnection(host, port=port, timeout=timeout)
        try:
            self._conn.connect(self._notweak)
        except Exception as e:
            msg = 'Failed to connect to %s: %s.' % (host_display, e)
            raise GoogleConnectionError(msg) from e
Possible causes include issuing too many queries in a short time frame, or operating from a shared / low reputation IP with a history of abuse. Please do NOT use googler for automated scraping.""") msg = " ".join(msg.splitlines()) raise GoogleConnectionError(msg) self._redirect(redirection_url) resp = self._resp redirect_counter += 1 else: break if resp.status != 200: raise GoogleConnectionError('Got HTTP %d: %s' % (resp.status, resp.reason)) payload = resp.read() try: return gzip.decompress(payload).decode('utf-8') except OSError: # Not gzipped return payload.decode('utf-8') def _redirect(self, url): """Redirect to and fetch a new URL. Like `_raw_get`, the response is stored in ``self._resp``. A new connection is made if redirecting to a different host. Parameters ---------- url : str If absolute and points to a different host, make a new connection. Raises ------ GoogleConnectionError """ logger.debug('Redirecting to URL %s', url) segments = urllib.parse.urlparse(url) host = segments.netloc if host != self._host: self.new_connection(host) relurl = urllib.parse.urlunparse(('', '') + segments[2:]) try: self._raw_get(relurl) except http.client.HTTPException as e: logger.debug('Got exception: %s.', e) raise GoogleConnectionError("Failed to get '%s'." % url) def _raw_get(self, url): """Make a raw HTTP GET request. No status check (which implies no redirection). Response can be accessed from ``self._resp``. Parameters ---------- url : str URL relative to the host, used in the GET request. 
Raises ------ http.client.HTTPException """ logger.debug('Fetching URL %s', url) self._conn.request('GET', url, None, { 'Accept': 'text/html', 'Accept-Encoding': 'gzip', 'User-Agent': USER_AGENT, 'Cookie': self.cookie, 'Connection': 'keep-alive', 'DNT': '1', }) self._resp = self._conn.getresponse() if self.cookie == '': complete_cookie = self._resp.getheader('Set-Cookie') # Cookie won't be available if already blocked if complete_cookie is not None: self.cookie = complete_cookie[:complete_cookie.find(';')] logger.debug('Cookie: %s' % self.cookie) def close(self): """Close the connection (if one is active).""" if self._conn: self._conn.close() class GoogleParser(object): def __init__(self, html, *, news=False): self.news = news self.autocorrected = False self.showing_results_for = None self.filtered = False self.results = [] self.parse(html) def parse(self, html): tree = parse_html(html) if debugger: printerr('\x1b[1mInspect the DOM through the \x1b[4mtree\x1b[24m variable.\x1b[0m') printerr('') try: import IPython IPython.embed() except ImportError: import pdb pdb.set_trace() index = 0 for div_g in tree.select_all('div.g'): if div_g.select('.hp-xpdbox'): # Skip smart cards. continue try: h3 = div_g.select('div.r h3') if h3: title = h3.text url = self.unwrap_link(h3.parent.attr('href')) else: h3 = div_g.select('h3.r') a = h3.select('a') title = a.text mime = div_g.select('.mime') if mime: title = mime.text + ' ' + title url = self.unwrap_link(a.attr('href')) matched_keywords = [] abstract = '' for childnode in div_g.select('.st').children: if 'f' in childnode.classes: # .f is handled as metadata instead. 
# Unwraps /url?q=http://...&sa=...
@staticmethod
def unwrap_link(link):
    """Extract the target URL from a Google redirect link.

    Parameters
    ----------
    link : str
        The ``href`` of a result anchor, typically of the form
        ``/url?q=http://...&sa=...``.

    Returns
    -------
    str
        The value of the ``q`` parameter when it holds a URL; `link`
        itself when there is no ``q`` parameter, or when `link` is an
        absolute URL on a non-Google host that is not in the ``/url``
        wrapper form (it is then already the target).

    Raises
    ------
    ValueError
        If ``q`` is present but is not a URL, as with Google's internal
        services links such as ``/search?q=google&...``, which cannot
        be unwrapped into an actual URL.
    """
    segments = urllib.parse.urlparse(link)

    # Don't unwrap a direct link to a third-party site just because it
    # has a ``q`` parameter of its own. Hosts with a "google" label keep
    # the previous treatment, so internal services links are still rejected.
    host = (segments.hostname or '').lower()
    if host and segments.path != '/url' and 'google' not in host.split('.'):
        return link

    try:
        url = urllib.parse.parse_qs(segments.query)['q'][0]
    except KeyError:
        return link

    if '://' in url:
        return url
    # Google's internal services link, e.g., /search?q=google&...,
    # which cannot be unwrapped into an actual URL.
    raise ValueError(link)
def jsonizable_object(self):
    """Build a JSON-serializable dict describing this result.

    ``title``, ``url`` and ``abstract`` are always present. ``metadata``,
    ``sitelinks`` and ``matches`` are added only when non-empty; each
    sitelink is represented by its attribute dict.
    """
    record = dict(title=self.title, url=self.url, abstract=self.abstract)
    for field in ('metadata', 'sitelinks', 'matches'):
        value = getattr(self, field)
        if not value:
            continue
        if field == 'sitelinks':
            value = [link.__dict__ for link in value]
        record[field] = value
    return record
class GooglerCmdException(Exception):
    """Base class for errors raised while running a GooglerCmd command."""


class NoKeywordsException(GooglerCmdException):
    """Raised when a command needs search keywords but none are set."""


def require_keywords(method):
    """Decorate a GooglerCmd method so that it requires keywords.

    The wrapped method raises NoKeywordsException when ``self.keywords``
    is empty; otherwise it delegates to `method` and passes its return
    value through.
    """
    @functools.wraps(method)
    def enforced_method(self, *args, **kwargs):
        if not self.keywords:
            raise NoKeywordsException('No keywords.')
        return method(self, *args, **kwargs)

    return enforced_method


def no_argument(method):
    """Normalize a zero-argument ``do_*`` method to take one argument.

    The wrapped method accepts a single `arg`. A non-empty `arg` is
    ignored with a warning that names the command (the method name with
    any leading ``do_`` stripped); `method` is then called without it.
    """
    # The command name comes from the decorated method, not from the
    # (string) argument, which has no __name__.
    method_name = method.__name__
    command_name = method_name[3:] if method_name.startswith('do_') else method_name

    @functools.wraps(method)
    def enforced_method(self, arg):
        if arg:
            logger.warning("Argument to the '%s' command ignored.", command_name)
        return method(self)

    return enforced_method
@require_keywords
def fetch(self):
    """Fetch the current page and parse it for results.

    The parsed results are stored in ``self.results``. As side effects
    this also refreshes ``self._autocorrected``,
    ``self._showing_results_for``, ``self._results_filtered`` and
    ``self._urltable``.

    Raises
    ------
    GoogleConnectionError

    See Also
    --------
    fetch_and_display
    """
    body = self._conn.fetch_page(self._google_url.relative())

    if logger.isEnabledFor(logging.DEBUG):
        # Keep a copy of the raw response on disk for bug reports.
        import tempfile
        handle, dump_path = tempfile.mkstemp(prefix='googler-response-', suffix='.html')
        os.close(handle)
        with open(dump_path, 'w', encoding='utf-8') as dump:
            dump.write(body)
        logger.debug("Response body written to '%s'.", dump_path)

    parsed = GoogleParser(body, news=self._google_url.news)

    self.results = parsed.results
    self._autocorrected = parsed.autocorrected
    self._showing_results_for = parsed.showing_results_for
    self._results_filtered = parsed.filtered

    # Merge the per-result index-to-URL tables into one lookup table.
    table = {}
    self._urltable = table
    for result in self.results:
        table.update(result.urltable())
@require_keywords
def fetch_and_display(self, prelude='\n', json_output=False, interactive=True):
    """Fetch a page and display results.

    Results are stored in ``self.results``.

    Parameters
    ----------
    prelude : str, optional
        A string that is written to stderr before showing actual results,
        usually serving as a separator. Default is an empty line.
    json_output : bool, optional
        Whether to dump results in JSON format. Default is False.
    interactive : bool, optional
        Whether to show contextual instructions, when e.g. Google
        has filtered the results. Default is True.

    Raises
    ------
    GoogleConnectionError

    See Also
    --------
    fetch
    display_results
    """
    self.fetch()
    # Forward the mode so the alert uses the matching wording
    # ('enter "x"' at the prompt, 'use -x, --exact' otherwise).
    self.showing_results_for_alert(interactive=interactive)
    self.display_results(prelude=prelude, json_output=json_output)
    # The "unfilter" hint names an omniprompt command, so it is only
    # shown when contextual instructions are wanted.
    if interactive and self._results_filtered:
        colors = self.colors
        info = 'Enter "unfilter" to show similar results Google omitted.'
        if colors:
            printerr(colors.prompt + info + colors.reset)
        else:
            printerr('** ' + info)
        printerr('')
@require_keywords
def do_open(self, *args):
    """Open results in the browser.

    With no arguments, open the current search itself. Otherwise each
    argument is one of: ``a`` (every entry of the URL table), an index
    present in the URL table, or a numeric range ``M-N`` (inclusive, in
    either order). Anything else is reported as invalid on stderr.
    """
    if not args:
        open_url(self._google_url.full())
        return

    table = self._urltable
    for nav in args:
        if nav == 'a':
            for _, target in sorted(table.items()):
                open_url(target)
            continue

        if nav in table:
            open_url(table[nav])
            continue

        if '-' not in nav:
            printerr('Invalid index %s.' % nav)
            continue

        try:
            bounds = [int(piece) for piece in nav.split('-')]
            if len(bounds) != 2:
                printerr('Invalid range %s.' % nav)
                continue
            low, high = min(bounds), max(bounds)
            for number in range(low, high + 1):
                key = str(number)
                if key in table:
                    open_url(table[key])
                else:
                    printerr('Invalid index %s.' % number)
        except ValueError:
            printerr('Invalid range %s.' % nav)
def cmdloop(self):
    """Run REPL.

    Performs an initial fetch when keywords are already set, then keeps
    reading omniprompt commands and dispatching them until ``q`` is
    entered. Commands that need keywords when none are set are answered
    with a hint instead of terminating the loop.
    """
    if self.keywords:
        self.fetch_and_display()
    else:
        printerr('Please initiate a query.')

    while True:
        self.read_next_command()
        # TODO: Automatic dispatcher
        #
        # We can't write a dispatcher for now because that could
        # change behaviour of the prompt. However, we have already
        # laid a lot of ground work for the dispatcher, e.g., the
        # `no_argument' decorator.
        try:
            cmd = self.cmd
            if cmd == 'f':
                self.do_first('')
            elif cmd.startswith('g '):
                self.do_google(cmd[2:])
            elif cmd == 'n':
                self.do_next('')
            elif cmd == 'o':
                self.do_open()
            elif cmd.startswith('o '):
                self.do_open(*cmd[2:].split())
            elif cmd.startswith('O '):
                open_url.override_text_browser = True
                try:
                    self.do_open(*cmd[2:].split())
                finally:
                    # Always restore the flag; otherwise a failing 'O'
                    # (e.g. no keywords yet) would leave every later
                    # 'o' overriding the text browser as well.
                    open_url.override_text_browser = False
            elif cmd == 'p':
                self.do_previous('')
            elif cmd == 'q':
                break
            elif cmd == 'x':
                self.do_exact('')
            elif cmd == 'unfilter':
                self.do_unfilter('')
            elif cmd == '?':
                self.help()
            elif cmd in self._urltable:
                open_url(self._urltable[cmd])
            elif self.keywords and cmd.isdigit() and int(cmd) < 100:
                printerr('Index out of bound. To search for the number, use g.')
            elif cmd == 'u':
                Result.urlexpand = not Result.urlexpand
                self.display_results()
            elif cmd.startswith('c ') and self.re_url_index.match(cmd[2:]):
                self.copy_url(cmd[2:])
            else:
                self.do_google(cmd)
        except NoKeywordsException:
            printerr('Initiate a query first.')
# Type guards
@staticmethod
def positive_int(arg):
    """Try to convert a string into a positive integer.

    Returns
    -------
    int
        The converted value, guaranteed to be > 0.

    Raises
    ------
    argparse.ArgumentTypeError
        If `arg` is not an integer literal or is not positive.
    """
    # Explicit comparison rather than `assert`: assertions are removed
    # under `python -O`, which would silently disable the range check.
    try:
        n = int(arg)
        if n <= 0:
            raise ValueError(arg)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError('%s is not a positive integer' % arg)
    return n

@staticmethod
def nonnegative_int(arg):
    """Try to convert a string into a nonnegative integer.

    Returns
    -------
    int
        The converted value, guaranteed to be >= 0.

    Raises
    ------
    argparse.ArgumentTypeError
        If `arg` is not an integer literal or is negative.
    """
    # See positive_int for why this does not use `assert`.
    try:
        n = int(arg)
        if n < 0:
            raise ValueError(arg)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError('%s is not a non-negative integer' % arg)
    return n
@staticmethod
def is_colorstr(arg):
    """Check if a string is a valid color string.

    A valid color string has exactly six characters, each of which is a
    key of ``COLORMAP``.

    Returns
    -------
    str
        `arg` unchanged, when valid.

    Raises
    ------
    argparse.ArgumentTypeError
        If `arg` has the wrong length or contains an unknown color code.
    """
    # Explicit checks rather than `assert`: assertions are removed under
    # `python -O`, which would let any string through.
    if len(arg) != 6 or any(c not in COLORMAP for c in arg):
        raise argparse.ArgumentTypeError('%s is not a valid color string' % arg)
    return arg
""" import urllib.request if include_git: # Get SHA of latest commit on master request = urllib.request.Request('%s/commits/master' % API_REPO_BASE, headers={'Accept': 'application/vnd.github.v3.sha'}) response = urllib.request.urlopen(request) if response.status != 200: raise http.client.HTTPException(response.reason) git_ref = response.read().decode('utf-8') else: # Get name of latest tag request = urllib.request.Request('%s/releases?per_page=1' % API_REPO_BASE, headers={'Accept': 'application/vnd.github.v3+json'}) response = urllib.request.urlopen(request) if response.status != 200: raise http.client.HTTPException(response.reason) import json git_ref = json.loads(response.read().decode('utf-8'))[0]['tag_name'] # Download googler to a tempfile googler_download_url = '%s/%s/googler' % (RAW_DOWNLOAD_REPO_BASE, git_ref) printerr('Downloading %s' % googler_download_url) request = urllib.request.Request(googler_download_url, headers={'Accept-Encoding': 'gzip'}) import tempfile fd, path = tempfile.mkstemp() atexit.register(lambda: os.remove(path) if os.path.exists(path) else None) os.close(fd) with open(path, 'wb') as fp: with urllib.request.urlopen(request) as response: if response.status != 200: raise http.client.HTTPException(response.reason) payload = response.read() try: fp.write(gzip.decompress(payload)) except OSError: fp.write(payload) return git_ref, path def self_replace(path): """Replace the current script with a specified file. Both paths (the specified path and path to the current script) are resolved to absolute, symlink-free paths. Upon replacement, the owner and mode signatures of the current script are preserved. The caller needs to have the necessary permissions. Replacement won't happen if the specified file is the same (content-wise) as the current script. Parameters ---------- path : str Path to the replacement file. Returns ------- bool True if replaced, False if skipped (specified file is the same as the current script). 
""" if system_is_windows(): raise NotImplementedError('Self upgrade not supported on Windows.') import filecmp import shutil path = os.path.realpath(path) self_path = os.path.realpath(__file__) if filecmp.cmp(path, self_path): return False self_stat = os.stat(self_path) os.chown(path, self_stat.st_uid, self_stat.st_gid) os.chmod(path, self_stat.st_mode) shutil.move(path, self_path) return True def self_upgrade(include_git=False): """Perform in-place self-upgrade. Parameters ---------- include_git : bool, optional See `download_latest_googler`. Default is False. """ git_ref, path = download_latest_googler(include_git=include_git) if self_replace(path): printerr('Upgraded to %s.' % git_ref) else: printerr('Already up to date.') # Miscellaneous functions def python_version(): return '%d.%d.%d' % sys.version_info[:3] def https_proxy_from_environment(): return os.getenv('https_proxy') def parse_proxy_spec(proxyspec): if '://' in proxyspec: pos = proxyspec.find('://') scheme = proxyspec[:pos] proxyspec = proxyspec[pos+3:] if scheme.lower() != 'http': # Only support HTTP proxies. # # In particular, we don't support HTTPS proxies since we # only speak plain HTTP to the proxy server, so don't give # users a false sense of security. raise NotImplementedError('Unsupported proxy scheme %s.' % scheme) if '@' in proxyspec: pos = proxyspec.find('@') user_passwd = urllib.parse.unquote(proxyspec[:pos]) # Remove trailing '/' if any host_port = proxyspec[pos+1:].rstrip('/') else: user_passwd = None host_port = proxyspec.rstrip('/') if ':' not in host_port: # Use port 1080 as default, following curl. host_port += ':1080' return user_passwd, host_port def set_win_console_mode(): # VT100 control sequences are supported on Windows 10 Anniversary Update and later. 
# Query autocompleter

# This function is largely experimental and could raise any exception;
# you should be prepared to catch anything. When it works though, it
# returns a list of strings the prefix could autocomplete to (however,
# it is not guaranteed that they start with the specified prefix; for
# instance, they won't if the specified prefix ends in a punctuation
# mark.)
def completer_fetch_completions(prefix):
    """Fetch query completions for `prefix` from Google's suggest endpoint.

    Parameters
    ----------
    prefix : str
        The partial query to complete. It is percent-encoded in full
        before being placed in the request URL.

    Returns
    -------
    list of str
        The first member of every entry in the response's result list,
        with HTML tags removed and HTML entities unescaped.
    """
    import html
    import json
    import re
    import urllib.parse
    import urllib.request

    # One can pass the 'hl' query param to specify the language. We
    # ignore that for now.
    api_url = ('https://www.google.com/complete/search?client=psy-ab&q=%s' %
               urllib.parse.quote(prefix, safe=''))
    # A timeout of 3 seconds seems to be overly generous already.
    with urllib.request.urlopen(api_url, timeout=3) as resp:
        charset = resp.headers.get_content_charset()
        payload = resp.read()
    logger.debug('Completions charset: %s', charset)
    # get_content_charset() returns None when the Content-Type header has
    # no charset parameter; decode(None) would raise TypeError, so fall
    # back to UTF-8 in that case.
    respobj = json.loads(payload.decode(charset or 'utf-8'))

    # The response object, once parsed as JSON, should look like
    #
    # ['git',
    #  [['github', 0],
    #   ['git', 0],
    #   ['gitlab', 0],
    #   ['git stash', 0]],
    #  {'q': 'oooAhRzoChqNmMbNaaDKXk1YY4k', 't': {'bpc': False, 'tlw': False}}]
    #
    # Note the each result entry need not have two members; e.g., for
    # 'gi', there is an entry ['gif', 0, [131]].

    HTML_TAG = re.compile(r'<[^>]+>')
    return [html.unescape(HTML_TAG.sub('', entry[0])) for entry in respobj[1]]
""" colorstr_env = os.getenv('GOOGLER_COLORS') argparser = GooglerArgumentParser(description='Google from the command-line.') addarg = argparser.add_argument addarg('-s', '--start', type=argparser.nonnegative_int, default=0, metavar='N', help='start at the Nth result') addarg('-n', '--count', dest='num', type=argparser.positive_int, default=10, metavar='N', help='show N results (default 10)') addarg('-N', '--news', action='store_true', help='show results from news section') addarg('-c', '--tld', metavar='TLD', help="""country-specific search with top-level domain .TLD, e.g., 'in' for India""") addarg('-l', '--lang', metavar='LANG', help='display in language LANG') addarg('-x', '--exact', action='store_true', help='disable automatic spelling correction') addarg('--colorize', nargs='?', choices=['auto', 'always', 'never'], const='always', default='auto', help="""whether to colorize output; defaults to 'auto', which enables color when stdout is a tty device; using --colorize without an argument is equivalent to --colorize=always""") addarg('-C', '--nocolor', action='store_true', help='equivalent to --colorize=never') addarg('--colors', dest='colorstr', type=argparser.is_colorstr, default=colorstr_env if colorstr_env else 'GKlgxy', metavar='COLORS', help='set output colors (see man page for details)') addarg('-j', '--first', '--lucky', dest='lucky', action='store_true', help='open the first result in web browser and exit') addarg('-t', '--time', dest='duration', type=argparser.is_duration, metavar='dN', help='time limit search ' '[h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)]') addarg('-w', '--site', dest='sites', action='append', metavar='SITE', help='search a site using Google') addarg('--unfilter', action='store_true', help='do not omit similar results') addarg('-p', '--proxy', default=https_proxy_from_environment(), help="""tunnel traffic through an HTTP proxy; PROXY is of the form [http://][user:password@]proxyhost[:port]""") addarg('--noua', 
action='store_true', help='legacy option (no effect)') addarg('--notweak', action='store_true', help='disable TCP optimizations and forced TLS 1.2') addarg('--json', action='store_true', help='output in JSON format; implies --noprompt') addarg('--url-handler', metavar='UTIL', help='custom script or cli utility to open results') addarg('--show-browser-logs', action='store_true', help='do not suppress browser output (stdout and stderr)') addarg('--np', '--noprompt', dest='noninteractive', action='store_true', help='search and exit, do not prompt') addarg('keywords', nargs='*', metavar='KEYWORD', help='search keywords') if ENABLE_SELF_UPGRADE_MECHANISM and not system_is_windows(): addarg('-u', '--upgrade', action='store_true', help='perform in-place self-upgrade') addarg('--include-git', action='store_true', help='when used with --upgrade, get latest git master') addarg('-v', '--version', action='version', version=_VERSION_) addarg('-d', '--debug', action='store_true', help='enable debugging') # Hidden option for interacting with DOM in an IPython/pdb shell addarg('-D', '--debugger', action='store_true', help=argparse.SUPPRESS) addarg('--complete', help=argparse.SUPPRESS) parsed = argparser.parse_args(args, namespace) if parsed.nocolor: parsed.colorize = 'never' return parsed def main(): try: opts = parse_args() # Set logging level if opts.debug: logger.setLevel(logging.DEBUG) logger.debug('googler version %s', _VERSION_) logger.debug('Python version %s', python_version()) if opts.debugger: global debugger debugger = True # Handle query completer if opts.complete is not None: completer_run(opts.complete) # Handle self-upgrade if hasattr(opts, 'upgrade') and opts.upgrade: self_upgrade(include_git=opts.include_git) sys.exit(0) check_stdout_encoding() if opts.keywords: try: # Add cmdline args to readline history readline.add_history(' '.join(opts.keywords)) except Exception: pass # Set colors if opts.colorize == 'always': colorize = True elif opts.colorize == 'auto': 
colorize = sys.stdout.isatty() else: # opts.colorize == 'never' colorize = False if colorize: colors = Colors(*[COLORMAP[c] for c in opts.colorstr], reset=COLORMAP['x']) else: colors = None Result.colors = colors Result.urlexpand = True if os.getenv('DISABLE_URL_EXPANSION') is None else False GooglerCmd.colors = colors # Try to enable ANSI color support in cmd or PowerShell on Windows 10 if sys.platform == 'win32' and sys.stdout.isatty() and colorize: set_win_console_mode() if opts.url_handler is not None: open_url.url_handler = opts.url_handler else: # Set text browser override to False open_url.override_text_browser = False # Handle browser output suppression if opts.show_browser_logs or (os.getenv('BROWSER') in text_browsers): open_url.suppress_browser_output = False else: open_url.suppress_browser_output = True if opts.noua: logger.warning('--noua option has been deprecated and has no effect (see #284)') repl = GooglerCmd(opts) if opts.json or opts.lucky or opts.noninteractive: # Non-interactive mode repl.fetch() if opts.lucky: if repl.results: open_url(repl.results[0].url) else: print('No results.', file=sys.stderr) else: repl.showing_results_for_alert(interactive=False) repl.display_results(json_output=opts.json) sys.exit(0) else: # Interactive mode repl.cmdloop() except Exception as e: # With debugging on, let the exception through for a traceback; # otherwise, only print the exception error message. if logger.isEnabledFor(logging.DEBUG): raise else: logger.error(e) sys.exit(1) if __name__ == '__main__': main() googler-4.0/googler.1000066400000000000000000000264021356731672100145520ustar00rootroot00000000000000.TH "GOOGLER" "1" "27 Nov 2019" "Version 4.0" "User Commands" .SH NAME googler \- Google from the command-line .SH SYNOPSIS .B googler [OPTIONS] [KEYWORD [KEYWORD ...]] .SH DESCRIPTION .B googler is a command-line tool to search Google (Web & News) from the terminal. Google site search works too. 
\fBgoogler\fR shows the title, URL and text context for each result. Results are fetched in pages. Next or previous page navigation is possible using keyboard shortcuts. Results are indexed and a result URL can be opened in a browser using the index number. There is no configuration file as aliases serve the same purpose for this utility. Supports sequential searches in a single instance. .PP .B Features .PP * Google Search, Google Site Search, Google News * Fast and clean (no ads, stray URLs or clutter), custom color * Navigate result pages from omniprompt, open URLs in browser * Effortless keyword-based site search with googler @t add-on * Search and option completion scripts for Bash, Zsh and Fish * Fetch n results in a go, start at the nth result * Disable automatic spelling correction and search exact keywords * Specify duration, country/domain (default: worldwide/.com), language * Google keywords (e.g. \fIfiletype:mime\fR, \fIsite:somesite.com\fR) support * Open the first result directly in browser (as in I'm Feeling Lucky) * Non-stop searches: fire new searches at omniprompt without exiting * HTTPS proxy, User Agent, TLS 1.2 (default) support * Comprehensive documentation, man page with handy usage examples * Minimal dependencies .SH OPTIONS .TP .BI "-h, --help" Show help text and exit. .TP .BI "-s, --start=" N Start at the \fIN\fRth result. .TP .BI "-n, --count=" N Show \fIN\fR results (default 10). .TP .BI "-N, --news" Show results from news section. .TP .BI "-c, --tld=" TLD Country-specific search with top-level domain \fI.TLD\fR, e.g., \fBin\fR for India. .TP .BI "-l, --lang=" LANG Search for the language \fILANG\fR, e.g., \fBfi\fR for Finnish. .TP .B "-x, --exact" Disable automatic spelling correction. Search exact keywords. .TP .B "-C, --nocolor" Disable color output. .TP .BI "--colors=" COLORS Set output colors. Refer to the \fBCOLORS\fR section below for details. 
.TP .B "-j, --first, --lucky" Open the first result in a web browser; implies \fB--noprompt\fR. Feeling Lucky? .TP .BI "-t, --time=" dN Time limit search [h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)]. .TP .BI "-w, --site=" SITE Search a site using Google. .TP .BI "--unfilter" Do not omit similar results. .TP .BI "-p, --proxy=" PROXY Tunnel traffic through an HTTP proxy. \fIPROXY\fR is of the form \fI[http://][user:password@]proxyhost[:port]\fR. The proxy server must support HTTP CONNECT tunneling and must not block port 443 for the relevant Google hosts. If a proxy is not explicitly given, the \fIhttps_proxy\fR environment variable (if available) is used instead. .TP .BI "--noua" Legacy option; it is deprecated and has no effect. .TP .BI "--notweak" Disable TCP optimizations. Negotiate Transport Layer Security protocol instead of forcing TLS 1.2 (on Python 3.4 and above). Should be used only in case of connection issues. .TP .BI "--json" Output in JSON format; implies \fB--noprompt\fR. .TP .BI "--url-handler=" UTIL Custom script or command-line utility to open URLs with. .TP .BI "--show-browser-logs" Do not suppress browser output when opening result in browser; that is, connect stdout and stderr of the browser to googler's stdout and stderr instead of /dev/null. By default, browser output is suppressed (due to certain graphical browsers spewing messages to console) unless the \fBBROWSER\fR environment variable is a known text-based browser: elinks, links, lynx, w3m or www-browser. .TP .BI "--np, --noprompt" Perform search and exit; do not prompt for further interactions. .TP .BI "-u, --upgrade" Perform in-place self-upgrade. By default, the latest stable version is used. However, the latest git master is used instead if \fB--include-git\fR is also supplied.
This mechanism is not available on Windows (including Cygwin), and if you installed \fBgoogler\fR with a package manager, this mechanism may have been disabled by your packager at packaging or install time. .TP .BI "--include-git" See \fB--upgrade\fR. .TP .BI "-v, --version" Show version number and exit. .TP .BI "-d, --debug" Enable debugging. .SH OMNIPROMPT KEYS .TP .BI "n, p" Fetch the next or previous set of search results. .TP .BI "index" Open the result corresponding to index in browser. .TP .BI "f" Jump to the first page. .TP .BI o " [index|range|a ...]" Open space-separated result indices, numeric ranges (sitelinks unsupported in ranges) or all indices, if 'a' is specified, in the browser. Open the current search in the browser, if no arguments. .TP .BI O " [index|range|a ...]" Works similar to key 'o', but tries to ignore text-based browsers (even if BROWSER is set) and open links in a GUI browser. .TP .BI g " keywords" Initiate a new Google search for \fIkeywords\fR with original options. This key should be used to search omniprompt keys (including itself) and indices. .TP .BI "c index" Copy url to clipboard. .TP .BI "u" Toggle url expansion. .TP .BI "q, ^D, double Enter" Exit googler. .TP .BI "?" Show omniprompt help. .TP .BI * Any other string initiates a new search with original options. .SH GOOGLER @T \fBgoogler @t\fR is a convenient add-on to Google Site Search with unique keywords. While \fBgoogler\fR has an integrated option to search a site, it could be simplified further with aliases. The file \fIgoogler_at\fR (https://github.com/jarun/googler/blob/master/auto-completion/googler_at/googler_at) contains a list of website search aliases. To source it, run: .PP .IP "" 4 .B source googler_at .PP or .PP .IP "" 4 .B . googler_at .PP With \fBgoogler @t\fR, the following command searches Wikipedia for \fIhexspeak\fR: .PP .IP "" 4 .B @w hexspeak .PP Other \fBgoogler\fR options can be combined. 
The shell can be configured to source the file at start-up for further convenience. .PP All the aliases start with the \fB@\fR symbol (hence the name \fBgoogler @t\fR) and there is minimum chance they will conflict with any shell commands. Users can add new aliases to the file. .SH COLORS \fBgoogler\fR allows you to customize the color scheme via a six-letter string, reminiscent of BSD \fBLSCOLORS\fR. The six letters represent the colors of .IP - 2 indices .PD 0 \" Change paragraph spacing to 0 in the list .IP - 2 titles .IP - 2 URLs .IP - 2 metadata/publishing info (Google News only) .IP - 2 abstracts .IP - 2 prompts .PD 1 \" Restore paragraph spacing .TP respectively. The six-letter string is passed in either as the argument to the \fB--colors\fR option, or as the value of the environment variable \fBGOOGLER_COLORS\fR. .TP We offer the following colors/styles: .TS tab(;) box; l|l -|- l|l. Letter;Color/Style a;black b;red c;green d;yellow e;blue f;magenta g;cyan h;white i;bright black j;bright red k;bright green l;bright yellow m;bright blue n;bright magenta o;bright cyan p;bright white A-H;bold version of the lowercase-letter color I-P;bold version of the lowercase-letter bright color x;normal X;bold y;reverse video Y;bold reverse video .TE .TP .TP The default colors string is \fIGKlgxy\fR, which stands for .IP - 2 bold cyan indices .PD 0 \" Change paragraph spacing to 0 in the list .IP - 2 bold bright green titles .IP - 2 bright yellow URLs .IP - 2 cyan metadata/publishing info .IP - 2 normal abstracts .IP - 2 reverse video prompts .PD 1 \" Restore paragraph spacing .TP Note that .IP - 2 Bright colors (implemented as \\x1b[90m - \\x1b[97m) may not be available in all color-capable terminal emulators; .IP - 2 Some terminal emulators draw bold text in bright colors instead; .IP - 2 Some terminal emulators only distinguish between bold and bright colors via a default-off switch.
.TP Please consult the manual of your terminal emulator as well as \fIhttps://en.wikipedia.org/wiki/ANSI_escape_code\fR for details. .SH ENVIRONMENT .TP .BI BROWSER Overrides the default browser. Ref: .I http://docs.python.org/library/webbrowser.html .TP .BI GOOGLER_COLORS Refer to the \fBCOLORS\fR section. .TP .BI DISABLE_PROMPT_COLOR Force a plain omniprompt if you are facing issues with colors at the prompt. .TP .BI https_proxy Refer to the \fB--proxy\fR option. .TP .BI DISABLE_URL_EXPANSION Show the domain names in search results instead of the expanded URL. .SH EXAMPLES .PP .IP 1. 4 Google \fBhello world\fR: .PP .EX .IP .B googler hello world .EE .PP .IP 2. 4 Fetch \fB15 results\fR updated within the last \fB14 months\fR, starting from the \fB3rd result\fR for the keywords \fBjungle book\fR in \fBsite\fR imdb.com: .PP .EX .IP .B googler -n 15 -s 3 -t m14 -w imdb.com jungle book .EE .PP .IP 3. 4 Read recent \fBnews\fR on gadgets: .PP .EX .IP .B googler -N gadgets .EE .PP .IP 4. 4 Fetch results on IPL cricket from \fBGoogle India\fR server in \fBEnglish\fR: .PP .EX .IP .B googler -c in -l en IPL cricket .EE .PP .IP 5. 4 Search \fBquoted text\fR: .PP .EX .IP .B googler it\(rs's a \(rs\(dqbeautiful world\(rs\(dq in spring .EE .PP .IP 6. 4 Search for a \fBspecific file type\fR: .PP .EX .IP .B googler instrumental filetype:mp3 .EE .PP .IP 7. 4 Disable \fBautomatic spelling correction\fR, e.g. fetch results for \fIgoogler\fR instead of \fIgoogle\fR: .PP .EX .IP .B googler -x googler .EE .PP .IP 8. 4 \fBI'm feeling lucky\fR search: .PP .EX .IP .B googler -j leather jackets .EE .PP .IP 9. 4 \fBWebsite specific\fR search: .PP .EX .IP .B googler -w amazon.com -w ebay.com digital camera .EE .PP .IP "" 4 Site specific search continues at omniprompt. .EE .PP .IP 10. 4 Alias to find \fBdefinitions of words\fR: .PP .EX .IP .B alias define='googler -n 2 define' .EE .PP .IP 11. 
4 Look up \fBn\fR, \fBp\fR, \fBo\fR, \fBO\fR, \fBq\fR, \fBg keywords\fR or a result index at the \fBomniprompt\fR: as the omniprompt recognizes these keys or index strings as commands, you need to prefix them with \fBg\fR, e.g., .PP .EX .PD 0 .IP .B g n .IP .B g g keywords .IP .B g 1 .PD .EE .PP .IP 12. 4 Input and output \fBredirection\fR: .PP .EX .IP .B googler -C hello world < input > output .EE .PP .IP "" 4 Note that \fI-C\fR is required to avoid printing control characters (for colored output). .IP 13. 4 \fBPipe\fR output: .PP .EX .IP .B googler -C hello world | tee output .EE .IP 14. 4 Use a \fBcustom color scheme\fR, e.g., one warm color scheme designed for Solarized Dark: .PP .EX .IP .B googler --colors bjdxxy google .IP .B GOOGLER_COLORS=bjdxxy googler google .EE .IP 15. 4 Tunnel traffic through an \fBHTTPS proxy\fR, e.g., a local Privoxy instance listening on port 8118: .PP .EX .IP .B googler --proxy localhost:8118 google .EE .PP .IP "" 4 By default the environment variable \fIhttps_proxy\fR is used, if defined. .IP 16. 4 Quote multiple search keywords to auto-complete (using completion script): .PP .EX .IP .B googler 'hello w .EE .SH AUTHORS Henri Hakkinen .br Arun Prakash Jana .br Zhiming Wang .SH HOME .I https://github.com/jarun/googler .SH REPORTING BUGS .I https://github.com/jarun/googler/issues .SH LICENSE Copyright \(co 2008 Henri Hakkinen .br Copyright \(co 2015-2019 Arun Prakash Jana .PP License GPLv3+: GNU GPL version 3 or later . .br This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. googler-4.0/googler.svg000066400000000000000000000527051356731672100152160ustar00rootroot00000000000000 googler-light Created with Sketch. 
googler-4.0/tests/000077500000000000000000000000001356731672100141705ustar00rootroot00000000000000googler-4.0/tests/googler.py000077700000000000000000000000001356731672100177672../googlerustar00rootroot00000000000000googler-4.0/tests/packagecore/000077500000000000000000000000001356731672100164345ustar00rootroot00000000000000googler-4.0/tests/packagecore/packagecore.yaml000066400000000000000000000027211356731672100215660ustar00rootroot00000000000000name: googler maintainer: Arun Prakash Jana license: GPLv3 summary: Google from the command-line. homepage: https://github.com/jarun/googler commands: install: - make PREFIX="/usr" install DESTDIR="${BP_DESTDIR}" packages: archlinux: builddeps: - make deps: - python container: "archlinux/base" centos7.5: builddeps: - make deps: - python centos7.6: builddeps: - make deps: - python debian8: builddeps: - make deps: - python3 debian9: builddeps: - make deps: - python3 debian10: builddeps: - make deps: - python3 fedora25: builddeps: - make deps: - python3 fedora26: builddeps: - make deps: - python3 fedora27: builddeps: - make deps: - python3 fedora28: builddeps: - make deps: - python3 fedora29: builddeps: - make deps: - python3 fedora30: builddeps: - make deps: - python3 fedora31: builddeps: - make deps: - python3 opensuse42.3: builddeps: - make deps: - python3 opensuse15.0: builddeps: - make deps: - python3 ubuntu14.04: builddeps: - make deps: - python3 ubuntu16.04: builddeps: - make deps: - python3 ubuntu18.04: builddeps: - make deps: - python3 googler-4.0/tests/parse000077500000000000000000000015761356731672100152410ustar00rootroot00000000000000#!/usr/bin/env python3 """Parse saved responses with GoogleParser.""" import argparse import json import googler def main(): argparser = argparse.ArgumentParser(description='Parse Google responses.') argparser.add_argument('-N', '--news', action='store_true', help='parse as Google News responses') argparser.add_argument('files', nargs='+', metavar='FILE', help="HTML file with 
Google's response body") args = argparser.parse_args() for fn in args.files: with open(fn, encoding='utf-8') as fp: htmlparser = googler.GoogleParser(news=args.news) htmlparser.feed(fp.read()) results_object = [r.jsonizable_object() for r in htmlparser.results] print(json.dumps(results_object, indent=2, sort_keys=True, ensure_ascii=False)) if __name__ == '__main__': main() googler-4.0/tests/test000077500000000000000000000114611356731672100151000ustar00rootroot00000000000000#!/usr/bin/env bash set -e trap 'exit 130' INT declare here googler quiet ci exitcode here="$(python3 -c 'import pathlib, sys; print(pathlib.Path(sys.argv[2]).parent.resolve())' -- "$0")" googler=$here/../googler quiet=0 ci=0 exitcode=0 while [[ $1 == -* ]]; do case $1 in --ci) ci=1 quiet=1 ;; -h|--help) cat <<'EOF' Usage: test [options] Run automated tests of googler(1). Requires shuf(1) from coreutils and /usr/share/dict/words. Options: --ci Same to --quiet, except LF is used instead of CR when printing progress information. The reason is that CI logs are typically line buffered, so CR won't flush the output, rendering progress info useless. -h, --help Print this help and exit. -q, --quiet Suppress googler's output except when a test fails. Some progress info is still printed to stderr. Note that without this option, this script is rather verbose. Environment variables: NUM_TEST_ITERATIONS Number of random tests to run. Default is 100. SLEEP_DURATION Number of seconds to sleep after each query. Default is 0. You may want to set this to avoid being blocked by Google for spamming. EOF exit 1 ;; -q|--quiet) quiet=1 ;; *) printf '\033[31mError: Unrecognized option %q.\033[0m\n' "$1" >&2 exit 1 ;; esac shift done [[ $# -gt 0 ]] && { printf '\033[31mError: Unrecognized argument %q.\033[0m\n' "$1" >&2 exit 1 } declare num_rand_words declare -a predefined_wordlist random_wordlist tld_args lang_args # A UTF-8 wordlist. 
predefined_wordlist=('汉语' 'español' 'português' 'ру́сский язы́к' '日本語' '한국어' 'le français')

# Requires shuf(1).
command -v shuf &>/dev/null || {
    printf '\033[31mError: shuf(1) not found.\033[0m\n' >&2
    exit 1
}

num_rand_words=10
# Read the sample one word per line. Unlike an unquoted $(...), mapfile
# never glob-expands a dictionary entry.
mapfile -t random_wordlist < <(shuf -n "$num_rand_words" /usr/share/dict/words 2>/dev/null)
(( ${#random_wordlist[@]} == num_rand_words )) || {
    printf '\033[31mError: Problem reading random words from /usr/share/dict/words.\033[0m\n' >&2
    exit 1
}

# Test googler with the given options, and report error if necessary.
#
# Whether googler's output is suppressed depends on whether the global variable
# quiet is truthy (set by -q, --quiet); when a failure is encountered, the
# global variable exitcode is set to 1, and if quiet was set, the test is rerun
# with output turned on.
#
# Globals:   googler (read), quiet (read), exitcode (written)
# Arguments: the options and keywords to pass to googler
test_googler () {
    # Report the failure of the googler run that immediately preceded the
    # call. With --rerun as first argument, run googler again with its
    # output visible.
    report_error () {
        # Must stay the first command: $? still holds googler's status here.
        local last_status=$?
        declare -g exitcode
        local rerun=0 rerun_status=0
        if [[ $1 == --rerun ]]; then
            rerun=1
            shift
        fi
        printf '\033[31mError: googler ' >&2
        printf '%q ' "$@" >&2
        printf 'failed with status %d.\033[0m\n' "$last_status" >&2
        exitcode=1
        if (( rerun )); then
            # Capture the status with || so that an expected failure does
            # not abort the whole script under `set -e`.
            "$googler" --noprompt -d "$@" || rerun_status=$?
            printf '\n\033[33m[Exit status] %d\033[0m\n' "$rerun_status"
        fi
    }

    declare -g quiet
    # "$googler" is quoted: it is derived from the script's own location,
    # which may contain whitespace.
    if (( quiet )); then
        "$googler" --noprompt -d "$@" &>/dev/null || report_error --rerun "$@"
    else
        printf '\033[34m==> googler ' >&2
        printf '%q ' "$@" >&2
        printf '\033[0m\n' >&2
        "$googler" --noprompt -d "$@" || report_error "$@"
        echo
    fi
}

# Write a list of configurations to $config_list, and later randomly pick from
# that list. (The reason we don't test them all is that Google would block us
# after thousands of queries.)
declare config_list
config_list="$(mktemp)"
trap 'rm -f "$config_list"' EXIT
for tld in com ar au be br ca ch cz de es 'fi' fr id 'in' it jp kr mx nl ph pl pt ro ru se tw ua uk; do
    if [[ $tld != com ]]; then
        tld_args=(-c "$tld")
    else
        tld_args=()
    fi
    for lang in default de en fr hi ja ko zh; do
        if [[ $lang != default ]]; then
            lang_args=(-l "$lang")
        else
            lang_args=()
        fi
        # Test single word queries.
        for keyword in "${predefined_wordlist[@]}" "${random_wordlist[@]}"; do
            printf '%s ' "${tld_args[@]}" "${lang_args[@]}" "$keyword"
            echo
        done
        # Test double word queries.
        for (( i = 0; i + 1 < num_rand_words; i += 2 )); do
            printf '%s ' "${tld_args[@]}" "${lang_args[@]}" \
                "${random_wordlist[i]}" "${random_wordlist[i+1]}"
            echo
        done
    done
done >"$config_list"

declare num_rand_configs
num_rand_configs="${NUM_TEST_ITERATIONS:-100}"
counter=0
# Each configuration line is a space-separated argument list. `read -a`
# splits it into words the same way unquoted expansion would, but without
# subjecting the words to pathname expansion.
while read -r -a args; do
    (( counter++ )) || :
    printf '\033[32mTest %d/%d\033[0m' "$counter" "$num_rand_configs" >&2
    if (( quiet && !ci )); then
        printf '\r' >&2
    else
        printf '\n' >&2
    fi
    test_googler "${args[@]}"
    sleep "${SLEEP_DURATION:-0}"
done < <(shuf -n "$num_rand_configs" "$config_list")

(( exitcode )) || printf '\033[K\033[32mAll passed.\033[0m\n'
exit "$exitcode"