simdjson-0.2.1/.appveyor.yml
version: '{build}'
branches:
only:
- master
image:
- Visual Studio 2017
clone_folder: c:\projects\simdjson
platform:
- x64
environment:
matrix:
- SIMDJSON_BUILD_STATIC: "OFF"
- SIMDJSON_BUILD_STATIC: "ON"
build_script:
- mkdir build
- cd build
- ps: cmake -DSIMDJSON_BUILD_STATIC="$env:SIMDJSON_BUILD_STATIC" -DCMAKE_BUILD_TYPE=Release -DCMAKE_GENERATOR_PLATFORM=x64 ..
- cmake --build .
- ctest --verbose
simdjson-0.2.1/.circleci/config.yml
version: 2
jobs:
"gcc":
docker:
- image: ubuntu:18.04
environment:
CXX: g++-7
steps:
- checkout
- run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
g++-7
git
- run:
name: Building (gcc)
command: make
- run:
name: Running tests (gcc)
command: make quiettest amalgamate
- run:
name: Building (gcc, cmake)
command: |
mkdir build
cd build
cmake ..
make
- run:
name: Running tests (gcc, cmake)
command: |
cd build
make test
"gccnoavx":
docker:
- image: ubuntu:18.04
environment:
CXX: g++-7
steps:
- checkout
- run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
g++-7
git
- run:
name: Building (gcc)
command: ARCHFLAGS="-march=nehalem" make
- run:
name: Running tests (gcc)
command: ARCHFLAGS="-march=nehalem" make quiettest amalgamate
- run:
name: Building (gcc, cmake)
command: |
mkdir build
cd build
cmake -DSIMDJSON_DISABLE_AVX=on ..
make
- run:
name: Running tests (gcc, cmake)
command: |
cd build
make test
"clang":
docker:
- image: ubuntu:18.04
environment:
CXX: clang++-6.0
steps:
- checkout
- run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
clang-6.0
git
- run:
name: Building (clang)
command: make
- run:
name: Running tests (clang)
command: make quiettest amalgamate
- run:
name: Building (clang, cmake)
command: |
mkdir build
cd build
cmake ..
make
- run:
name: Running tests (clang, cmake)
command: |
cd build
make test
"clangnoavx":
docker:
- image: ubuntu:18.04
environment:
CXX: clang++-6.0
steps:
- checkout
- run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
clang-6.0
git
- run:
name: Building (clang)
command: ARCHFLAGS="-march=nehalem" make
- run:
name: Running tests (clang)
command: ARCHFLAGS="-march=nehalem" make quiettest amalgamate
- run:
name: Building (clang, cmake)
command: |
mkdir build
cd build
cmake -DSIMDJSON_DISABLE_AVX=on ..
make
- run:
name: Running tests (clang, cmake)
command: |
cd build
make test
workflows:
version: 2
build_and_test:
jobs:
- "clang"
- "gcc"
- "clangnoavx"
- "gccnoavx" simdjson-0.2.1/.clang-format 0000664 0000000 0000000 00000000023 13521632563 0015714 0 ustar 00root root 0000000 0000000 BasedOnStyle: LLVM
simdjson-0.2.1/.drone.yml
kind: pipeline
name: x64
platform:
os: linux
arch: amd64
steps:
- name: test
image: gcc:8
commands:
- make
- make quiettest
- make amalgamate
---
kind: pipeline
name: arm64
platform:
os: linux
arch: arm64
steps:
- name: test
image: gcc:8
commands:
- make
- make quiettest
- make amalgamate
---
kind: pipeline
name: stylecheck
platform:
os: linux
arch: amd64
steps:
- name: Build and Test
image: ubuntu:18.04
commands:
- apt-get update -y
- apt-get install -y python clang-format
- ./style/run-clang-format.py -r include/ benchmark/ src/ tests/
---
kind: pipeline
name: amd64_clang_cmake_dynamic
platform:
os: linux
arch: amd64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CXX: clang++
commands:
- apt-get update -y
- apt-get install -y make $CC g++ cmake
- $CC --version
- mkdir build && cd build
- cmake -DSIMDJSON_BUILD_STATIC=OFF $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: amd64_clang_cmake_static
platform:
os: linux
arch: amd64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CXX: clang++
commands:
- apt-get update -y
- apt-get install -y make $CC g++ cmake
- $CC --version
- mkdir build && cd build
- cmake -DSIMDJSON_BUILD_STATIC=ON $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: amd64_gcc_cmake_static
platform:
os: linux
arch: amd64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CXX: g++
commands:
- apt-get update -y
- apt-get install -y make $CC g++ cmake
- $CC --version
- mkdir build && cd build
- cmake -DSIMDJSON_BUILD_STATIC=ON $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: amd64_gcc_cmake_dynamic
platform:
os: linux
arch: amd64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CXX: g++
commands:
- apt-get update -y
- apt-get install -y make $CC g++ cmake
- $CC --version
- mkdir build && cd build
- cmake -DSIMDJSON_BUILD_STATIC=OFF $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_clang_cmake_dynamic
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CXX: clang++
commands:
- apt-get update -y
- apt-get install -y make $CC g++ cmake
- $CC --version
- mkdir build && cd build
- cmake -DSIMDJSON_BUILD_STATIC=OFF $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_gcc_cmake_dynamic
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CXX: g++
commands:
- apt-get update -y
- apt-get install -y make $CC g++ cmake
- $CC --version
- mkdir build && cd build
- cmake -DSIMDJSON_BUILD_STATIC=OFF $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_clang_cmake_static
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CXX: clang++
commands:
- apt-get update -y
- apt-get install -y make $CC g++ cmake
- $CC --version
- mkdir build && cd build
- cmake -DSIMDJSON_BUILD_STATIC=ON $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_gcc_cmake_static
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CXX: g++
commands:
- apt-get update -y
- apt-get install -y make $CC g++ cmake
- $CC --version
- mkdir build && cd build
- cmake -DSIMDJSON_BUILD_STATIC=ON $CMAKE_FLAGS ..
- make -j
- ctest
simdjson-0.2.1/.gitattributes
# Set the default behavior, in case people don't have core.autocrlf set.
# Uncomment next line to adjust line endings
* text=auto
# Explicitly declare text files you want to always be normalized and converted
# to native line endings on checkout.
*.c text
*.cpp text
*.h text
*.java text
*.xml text
# Denote all files that are truly binary and should not be modified.
*.png binary
*.jpg binary
*.svg binary
*.json binary
# we don't want json files to be modified for this project
simdjson-0.2.1/.gitignore
build/
simdjson-0.2.1/.gitmodules
[submodule "scalarvssimd/rapidjson"]
path = dependencies/rapidjson
url = https://github.com/Tencent/rapidjson.git
[submodule "dependencies/sajson"]
path = dependencies/sajson
url = https://github.com/chadaustin/sajson.git
[submodule "dependencies/json11"]
path = dependencies/json11
url = https://github.com/dropbox/json11.git
[submodule "dependencies/fastjson"]
path = dependencies/fastjson
url = https://github.com/mikeando/fastjson.git
[submodule "dependencies/gason"]
path = dependencies/gason
url = https://github.com/vivkin/gason.git
[submodule "dependencies/ujson4c"]
path = dependencies/ujson4c
url = https://github.com/esnme/ujson4c.git
[submodule "dependencies/jsmn"]
path = dependencies/jsmn
url = https://github.com/zserge/jsmn.git
[submodule "dependencies/cJSON"]
path = dependencies/cJSON
url = https://github.com/DaveGamble/cJSON.git
[submodule "dependencies/jsoncpp"]
path = dependencies/jsoncpp
url = https://github.com/open-source-parsers/jsoncpp.git
[submodule "dependencies/json"]
path = dependencies/json
url = https://github.com/nlohmann/json.git
simdjson-0.2.1/.travis.yml
language: cpp
sudo: false
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-7
- g++-7
- clang-format
- python
branches:
only:
- master
script:
- export CXX=g++-7
- export CC=gcc-7
- make
- make test
- make everything
- make amalgamate
- make clean
- make SANITIZEGOLD=1 test
- make clean
- ARCHFLAGS="-march=nehalem" make
- ARCHFLAGS="-march=nehalem" make test
- ARCHFLAGS="-march=nehalem" make everything
- ./style/run-clang-format.py -r include/ benchmark/ src/ tests/
simdjson-0.2.1/AUTHORS
# List of authors for copyright purposes
Daniel Lemire
Geoff Langdale
simdjson-0.2.1/CMakeLists.txt
cmake_minimum_required(VERSION 3.9) # CMP0069 NEW
include(CheckIPOSupported)
check_ipo_supported(RESULT ltoresult)
if(ltoresult)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
# usage: cmake -DSIMDJSON_DISABLE_AVX=on ..
option(SIMDJSON_DISABLE_AVX "Forcefully disable AVX even if hardware supports it" OFF)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_MACOSX_RPATH OFF)
if (NOT CMAKE_BUILD_TYPE)
message(STATUS "No build type selected, default to Release")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
endif()
project(simdjson)
set(SIMDJSON_LIB_NAME simdjson)
set(PROJECT_VERSION_MAJOR 0)
set(PROJECT_VERSION_MINOR 2)
set(PROJECT_VERSION_PATCH 1)
set(SIMDJSON_LIB_VERSION "0.2.1" CACHE STRING "simdjson library version")
set(SIMDJSON_LIB_SOVERSION "0" CACHE STRING "simdjson library soversion")
if(NOT MSVC)
option(SIMDJSON_BUILD_STATIC "Build a static library" OFF) # turning it on disables the production of a dynamic library
else()
option(SIMDJSON_BUILD_STATIC "Build a static library" ON) # turning it on disables the production of a dynamic library
endif()
option(SIMDJSON_SANITIZE "Sanitize addresses" OFF)
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/tools/cmake")
find_package(CTargets)
find_package(Options)
install(DIRECTORY include/${SIMDJSON_LIB_NAME} DESTINATION include)
set (TEST_DATA_DIR "${CMAKE_CURRENT_SOURCE_DIR}/jsonchecker/")
set (BENCHMARK_DATA_DIR "${CMAKE_CURRENT_SOURCE_DIR}/jsonexamples/")
add_definitions(-DSIMDJSON_TEST_DATA_DIR="${TEST_DATA_DIR}")
add_definitions(-DSIMDJSON_BENCHMARK_DATA_DIR="${BENCHMARK_DATA_DIR}")
enable_testing()
add_subdirectory(src)
add_subdirectory(tools)
add_subdirectory(tests)
add_subdirectory(benchmark)
set(CPACK_PACKAGE_VENDOR "Daniel Lemire")
set(CPACK_PACKAGE_CONTACT "lemire@gmail.com")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Parsing gigabytes of JSON per second")
set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR})
set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH})
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
set(CPACK_RPM_PACKAGE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_SOURCE_GENERATOR "TGZ;ZIP")
include(CPack)
simdjson-0.2.1/CONTRIBUTORS
# contributors (in no particular order)
Thomas Navennec
Kai Wolf
Tyler Kennedy
Frank Wessels
George Fotopoulos
Heinz N. Gies
Emil Gedda
Wojciech Muła
Georgios Floros
Dong Xie
Nan Xiao
Egor Bogatov
Jinxi Wang
Luiz Fernando Peres
Wouter Bolsterlee
Anish Karandikar
Reini Urban
Tom Dyson
Ihor Dotsenko
Alexey Milovidov
Chang Liu
Sunny Gleason
John Keiser
Zach Bjornson
# if you have contributed to the project and your name does not
# appear in this list, please let us know!
simdjson-0.2.1/Dockerfile
# docker build -t simdjson . && docker run --privileged -t simdjson
FROM gcc:8.3
COPY . /usr/src/
WORKDIR /usr/src/
RUN make clean
RUN make amalgamate
RUN make
RUN make test
RUN make parsingcompetition
CMD ["bash", "scripts/selectparser.sh"]
simdjson-0.2.1/LICENSE
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2018-2019 The simdjson authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
simdjson-0.2.1/Makefile
.SUFFIXES:
#
.SUFFIXES: .cpp .o .c .h
.PHONY: clean cleandist
COREDEPSINCLUDE = -Idependencies/json/single_include -Idependencies/rapidjson/include -Idependencies/sajson/include -Idependencies/cJSON -Idependencies/jsmn
EXTRADEPSINCLUDE = -Idependencies/jsoncppdist -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src
# users can provide their own additional flags with make EXTRAFLAGS=something
architecture:=$(shell arch)
####
# If you want to specify your own target architecture,
# then define ARCHFLAGS. Otherwise, we set good default.
# E.g., type ' ARCHFLAGS="-march=nehalem" make parse '
###
ifeq ($(architecture),aarch64)
ARCHFLAGS ?= -march=armv8-a+crc+crypto
else
ARCHFLAGS ?= -msse4.2 -mpclmul # lowest supported feature set?
endif
CXXFLAGS = $(ARCHFLAGS) -std=c++17 -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(EXTRAFLAGS)
CFLAGS = $(ARCHFLAGS) -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src $(EXTRAFLAGS)
# This is a convenience flag
ifdef SANITIZEGOLD
SANITIZE = 1
LINKER = gold
endif
ifdef LINKER
CXXFLAGS += -fuse-ld=$(LINKER)
CFLAGS += -fuse-ld=$(LINKER)
endif
# SANITIZE *implies* DEBUG
ifeq ($(MEMSANITIZE),1)
CXXFLAGS += -g3 -O0 -fsanitize=memory -fno-omit-frame-pointer -fsanitize=undefined
CFLAGS += -g3 -O0 -fsanitize=memory -fno-omit-frame-pointer -fsanitize=undefined
else
ifeq ($(SANITIZE),1)
CXXFLAGS += -g3 -O0 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined
CFLAGS += -g3 -O0 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined
else
ifeq ($(DEBUG),1)
CXXFLAGS += -g3 -O0
CFLAGS += -g3 -O0
else
# we opt for -O3 for regular builds
CXXFLAGS += -O3
CFLAGS += -O3
endif # ifeq ($(DEBUG),1)
endif # ifeq ($(SANITIZE),1)
endif # ifeq ($(MEMSANITIZE),1)
MAINEXECUTABLES=parse minify json2json jsonstats statisticalmodel jsonpointer
TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck pointercheck
COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile allparsingcompetition
SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing
HEADERS= include/simdjson/simdutf8check_haswell.h include/simdjson/simdutf8check_westmere.h include/simdjson/simdutf8check_arm64.h include/simdjson/stringparsing.h include/simdjson/stringparsing_arm64.h include/simdjson/stringparsing_haswell.h include/simdjson/stringparsing_macros.h include/simdjson/stringparsing_westmere.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage1_find_marks_arm64.h include/simdjson/stage1_find_marks_haswell.h include/simdjson/stage1_find_marks_westmere.h include/simdjson/stage1_find_marks_macros.h include/simdjson/stage2_build_tape.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h include/simdjson/stage1_find_marks_flatten.h include/simdjson/stage1_find_marks_flatten_haswell.h
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/parsedjson.cpp src/parsedjsoniterator.cpp
MINIFIERHEADERS=include/simdjson/jsonminifier.h include/simdjson/simdprune_tables.h
MINIFIERLIBFILES=src/jsonminifier.cpp
RAPIDJSON_INCLUDE:=dependencies/rapidjson/include
SAJSON_INCLUDE:=dependencies/sajson/include
JSON11_INCLUDE:=dependencies/json11/json11.hpp
FASTJSON_INCLUDE:=dependencies/include/fastjson/fastjson.h
GASON_INCLUDE:=dependencies/gason/src/gason.h
UJSON4C_INCLUDE:=dependencies/ujson4c/src/ujdecode.c
CJSON_INCLUDE:=dependencies/cJSON/cJSON.h
JSMN_INCLUDE:=dependencies/jsmn/jsmn.h
JSON_INCLUDE:=dependencies/json/single_include/nlohmann/json.hpp
LIBS=$(RAPIDJSON_INCLUDE) $(JSON_INCLUDE) $(SAJSON_INCLUDE) $(JSON11_INCLUDE) $(FASTJSON_INCLUDE) $(GASON_INCLUDE) $(UJSON4C_INCLUDE) $(CJSON_INCLUDE) $(JSMN_INCLUDE)
EXTRAOBJECTS=ujdecode.o
all: $(MAINEXECUTABLES)
competition: $(COMPARISONEXECUTABLES)
.PHONY: benchmark test
benchmark:
bash ./scripts/parser.sh
bash ./scripts/parseandstat.sh
test: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
./basictests
./numberparsingcheck
./stringparsingcheck
./jsoncheck
./pointercheck
./scripts/testjson2json.sh
./scripts/issue150.sh
@echo "It looks like the code is good!"
quiettest: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
./basictests
./numberparsingcheck
./stringparsingcheck
./jsoncheck
./pointercheck
./scripts/testjson2json.sh
./scripts/issue150.sh
amalgamate:
./amalgamation.sh
$(CXX) $(CXXFLAGS) -o singleheader/demo ./singleheader/amalgamation_demo.cpp -Isingleheader
submodules:
-git submodule update --init --recursive
-touch submodules
$(JSON_INCLUDE) $(SAJSON_INCLUDE) $(RAPIDJSON_INCLUDE) $(JSON11_INCLUDE) $(FASTJSON_INCLUDE) $(GASON_INCLUDE) $(UJSON4C_INCLUDE) $(CJSON_INCLUDE) $(JSMN_INCLUDE) : submodules
parse: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parse $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS)
statisticalmodel: benchmark/statisticalmodel.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o statisticalmodel $(LIBFILES) benchmark/statisticalmodel.cpp $(LIBFLAGS)
parse_noutf8validation: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parse_noutf8validation -DSIMDJSON_SKIPUTF8VALIDATION $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS)
parse_nonumberparsing: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parse_nonumberparsing -DSIMDJSON_SKIPNUMBERPARSING $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS)
parse_nostringparsing: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parse_nostringparsing -DSIMDJSON_SKIPSTRINGPARSING $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS)
jsoncheck:tests/jsoncheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o jsoncheck $(LIBFILES) tests/jsoncheck.cpp -I. $(LIBFLAGS)
basictests:tests/basictests.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o basictests $(LIBFILES) tests/basictests.cpp -I. $(LIBFLAGS)
numberparsingcheck:tests/numberparsingcheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o numberparsingcheck tests/numberparsingcheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp -I. $(LIBFLAGS) -DJSON_TEST_NUMBERS
stringparsingcheck:tests/stringparsingcheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o stringparsingcheck tests/stringparsingcheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp -I. $(LIBFLAGS) -DJSON_TEST_STRINGS
pointercheck:tests/pointercheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o pointercheck tests/pointercheck.cpp src/stage2_build_tape.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp src/parsedjsoniterator.cpp -I. $(LIBFLAGS)
minifiercompetition: benchmark/minifiercompetition.cpp $(HEADERS) submodules $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES)
$(CXX) $(CXXFLAGS) -o minifiercompetition $(LIBFILES) $(MINIFIERLIBFILES) benchmark/minifiercompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
minify: tools/minify.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES)
$(CXX) $(CXXFLAGS) -o minify $(MINIFIERLIBFILES) $(LIBFILES) tools/minify.cpp -I.
json2json: tools/json2json.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o json2json tools/json2json.cpp $(LIBFILES) -I.
jsonpointer: tools/jsonpointer.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o jsonpointer tools/jsonpointer.cpp $(LIBFILES) -I.
jsonstats: tools/jsonstats.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o jsonstats tools/jsonstats.cpp $(LIBFILES) -I.
ujdecode.o: $(UJSON4C_INCLUDE)
$(CC) $(CFLAGS) -c dependencies/ujson4c/src/ujdecode.c
parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES) submodules
$(CXX) $(CXXFLAGS) -o parseandstatcompetition $(LIBFILES) benchmark/parseandstatcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(LIBFILES) submodules
$(CXX) $(CXXFLAGS) -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) submodules
@echo "In case of build error due to missing files, try 'make clean'"
$(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
allparsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) submodules
$(CXX) $(CXXFLAGS) -o allparsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE) -DALLPARSER
allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) submodules
$(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE)
.PHONY: clean cppcheck cleandist
cppcheck:
cppcheck --enable=all src/*.cpp benchmark/*.cpp tests/*.cpp -Iinclude -I. -Ibenchmark/linux
everything: $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) $(SUPPLEMENTARYEXECUTABLES)
clean:
rm -f submodules $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) $(SUPPLEMENTARYEXECUTABLES)
cleandist:
rm -f submodules $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) $(SUPPLEMENTARYEXECUTABLES)
simdjson-0.2.1/Notes.md
# Notes on simdjson
## Rationale:
The simdjson project serves two purposes:
1. It creates a useful library for parsing JSON data quickly.
2. It is a demonstration of the use of SIMD and pipelined programming techniques to perform a complex and irregular task.
These techniques include the use of large registers and SIMD instructions to process large amounts of input data at once,
to hold larger entities than can typically be held in a single General Purpose Register (GPR), and to perform operations
that are not cheap to perform without use of a SIMD unit (for example table lookup using permute instructions).
The other key technique is that the system is designed to minimize the number of unpredictable branches that must be taken
to perform the task. Modern architectures are both wide and deep (4-wide pipelines with ~14 stages are commonplace). A
recent Intel Architecture processor, for example, can perform 3 256-bit SIMD operations or 2 512-bit SIMD operations per
cycle as well as other operations on general purpose registers or with the load/store unit. An incorrectly predicted branch
will clear this pipeline. While it is rare that a programmer can achieve the maximum throughput on a machine, a developer
may be missing the opportunity to carry out roughly 56 operations for each branch miss (a 4-wide pipeline with ~14 stages keeps about 4 × 14 = 56 operations in flight).
Many code-bases make use of SIMD and deeply pipelined, "non-branchy", processing for regular tasks. Numerical problems
(e.g. "matrix multiply") or simple 'bulk search' tasks (e.g. "count all the occurrences of a given character in a text",
"find the first occurrence of the string 'foo' in a text") frequently use this class of techniques. We are demonstrating
that these techniques can be applied to much more complex and less regular tasks.
## Design:
### Stage 1: SIMD over bytes; bit vector processing over bytes.
The first stage of our processing must identify key points in our input: the 'structural characters' of JSON (curly and
square braces, colon, and comma), the start and end of strings as delineated by double quote characters, other JSON 'atoms'
that are not distinguishable by simple characters (constructs such as "true", "false", "null" and numbers), as well as
discovering these characters and atoms in the presence of both quoting conventions and backslash escaping conventions.
As such we follow the broad outline of the construction of a structural index as set forth in the Mison paper [XXX]; first,
the discovery of odd-length sequences of backslash characters (which will cause quote characters immediately following to
be escaped and not serve their quoting role but instead be literal characters), second, the discovery of quote pairs (which
cause structural characters within the quote pairs to also be merely literal characters and have no function as structural
characters), then finally the discovery of structural characters not contained within the quote pairs.
We depart from the Mison paper in terms of method and overall design. In terms of method, the Mison paper uses iteration
over bit vectors to discover backslash sequences and quote pairs; we introduce branch-free techniques to discover both of
these properties.
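To make the branch-free idea concrete, here is a scalar sketch of detecting the ends of odd-length backslash runs from a 64-bit backslash bitmask. This is illustrative only: the library's actual code operates on SIMD-generated masks and carries state (including overflow at bit 63) across 64-byte blocks, which this single-block version omits.

```c++
#include <cstdint>

// Bit i of `backslashes` is set when input byte i is '\\'. The returned mask has a
// bit set at each position immediately following an odd-length run of backslashes,
// i.e., at each byte that is escaped.
uint64_t odd_backslash_run_ends(uint64_t backslashes) {
  const uint64_t even_bits = 0x5555555555555555ULL; // bits at even byte offsets
  const uint64_t odd_bits = ~even_bits;             // bits at odd byte offsets

  uint64_t starts = backslashes & ~(backslashes << 1); // first backslash of each run
  uint64_t even_starts = starts & even_bits;           // runs beginning at even offsets
  uint64_t odd_starts = starts & odd_bits;             // runs beginning at odd offsets

  // Adding the start bit back into its run propagates a carry to the byte just past
  // the end of that run; everything inside runs is cleared by `& ~backslashes`.
  uint64_t ends_of_even_started_runs = (backslashes + even_starts) & ~backslashes;
  uint64_t ends_of_odd_started_runs = (backslashes + odd_starts) & ~backslashes;

  // A run has odd length exactly when the byte after it has the opposite parity
  // from the byte where the run started.
  return (ends_of_even_started_runs & odd_bits) | (ends_of_odd_started_runs & even_bits);
}
```

A mask of quote characters can then be cleared at these escaped positions before quote pairs are matched, all without data-dependent branches.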
We also make use of our ability to quickly detect whitespace in this early stage. We can use another bit-vector based
transformation to discover locations in our data that follow a structural character or quote or whitespace and are not whitespace. Excluding locations within strings, and the structural characters we have already discovered,
these locations are the only places where we can expect to see the starts of the JSON 'atoms'. These locations are thus
treated as 'structural' ('pseudo-structural characters').
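A similarly simplified, single-block sketch of this bit-vector step follows; it assumes per-block class masks and ignores the carry of the last character's class into the next 64-byte block.

```c++
#include <cstdint>

// Bit i of each mask is set when byte i belongs to that class.
uint64_t pseudo_structurals(uint64_t structurals, uint64_t quotes,
                            uint64_t whitespace, uint64_t in_string) {
  // Positions that immediately follow a structural character, a quote, or whitespace.
  uint64_t follows = (structurals | quotes | whitespace) << 1;
  // Keep only positions that are not whitespace, not inside a string, and not already
  // structural: these are the candidate starts of JSON atoms.
  return follows & ~whitespace & ~in_string & ~structurals;
}
```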
This stage involves either SIMD processing over bytes or the manipulation of bit arrays that have 1 bit corresponding
to 1 byte of input. As such, it can be quite inefficient for some inputs - it is possible to observe dozens of operations
taking place to discover that there are in fact no odd-numbered sequences of backslashes or quotes in a given block of
input. However, this inefficiency on such inputs is balanced by the fact that it costs no more to run this code over
complex structured input, and the alternatives would generally involve running a number of unpredictable branches (for
example, the loop branches in Mison that iterate over bit vectors).
### Stage 2: The transition from "SIMD over bytes" to "indices"
Our structural, pseudo-structural and other 'interesting' characters are relatively rare (TODO: quantify in detail -
it's typically about 1 in 10). As such, continuing to process them as bit vectors will involve manipulating data structures
that are relatively large as well as being fairly unpredictably spaced. We must transform these bitvectors of "interesting"
locations into offsets.
Note that we can examine the character at the offset to discover what the original function of the item in the bitvector
was. While the JSON structural characters and quotes are relatively self-explanatory (although working only with one offset
at a time, we have lost the distinction between opening quotes and closing quotes, something that was available in Stage 1),
it is a quirk of JSON that the legal atoms can all be distinguished from each other by their first character - 't' for
'true', 'f' for 'false', 'n' for 'null' and the character class [0-9-] for numerical values.
Thus, the offset suffices, as long as we retain our original input.
Our current implementation involves a straightforward transformation of bitmaps to indices by use of the 'count trailing
zeros' operation and the well-known operation to clear the lowest set bit. Note that this implementation introduces an
unpredictable branch; unless there is a regular pattern in our bitmaps, we would expect to have at least one branch miss
for each bitmap.
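A minimal sketch of this bitmap-to-indices transformation is shown below (the library's actual extraction loop may differ in details such as unrolling; `__builtin_ctzll` is a GCC/Clang builtin).

```c++
#include <cstdint>
#include <vector>

// `base` is the byte offset of the first byte covered by `bits`.
void bitmap_to_indices(uint64_t bits, uint32_t base, std::vector<uint32_t> &out) {
  while (bits != 0) {                       // this loop branch is the unpredictable one
    uint32_t offset = static_cast<uint32_t>(__builtin_ctzll(bits)); // count trailing zeros
    out.push_back(base + offset);
    bits &= bits - 1;                       // clear the lowest set bit
  }
}
```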
### Stage 3: Operation over indices
This now works over a dual structure.
1. The "state machine", whose role it is to validate the sequence of structural characters and ensure that the input is at least generally structured like valid JSON (after this stage, the only errors permissible should be malformed atoms and numbers). If and only if the "state machine" reached all accept states, then,
2. The "tape machine" will have produced valid output. The tape machine works blindly over characters writing records to tapes. These records create a lean but somewhat traversable linked structure that, for valid inputs, should represent what we need to know about the JSON input.
FIXME: a lot more detail is required on the operation of both these machines.
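For illustration only, a record on such a tape could be a single 64-bit word combining a one-byte type code with a 56-bit payload (an offset into the input or into the tape); the authoritative description of the project's actual format is in the accompanying `tape.md` and may differ from this sketch.

```c++
#include <cstdint>

struct tape_record {
  uint64_t word;

  char type() const { return static_cast<char>(word >> 56); } // e.g. '{', '[', '"'
  uint64_t payload() const { return word & ((uint64_t(1) << 56) - 1); }

  static tape_record make(char type, uint64_t payload) {
    return tape_record{(uint64_t(static_cast<unsigned char>(type)) << 56) |
                       (payload & ((uint64_t(1) << 56) - 1))};
  }
};
```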
simdjson-0.2.1/README.md
# simdjson : Parsing gigabytes of JSON per second
[](https://cloud.drone.io/lemire/simdjson/)
[](https://circleci.com/gh/lemire/simdjson)
[](https://ci.appveyor.com/project/lemire/simdjson)
[![][license img]][license]
[](https://lgtm.com/projects/g/lemire/simdjson/context:cpp)
## A C++ library to see how fast we can parse JSON with complete validation.
JSON documents are everywhere on the Internet. Servers spend a lot of time parsing these documents. We want to accelerate the parsing of JSON per se using commonly available SIMD instructions as much as possible while doing full validation (including character encoding).
## Real-world usage
- [Microsoft FishStore](https://github.com/microsoft/FishStore)
- [Yandex ClickHouse](https://github.com/yandex/ClickHouse)
## Paper
A description of the design and implementation of simdjson appears at https://arxiv.org/abs/1902.08318 and an informal blog post providing some background and context is at https://branchfree.org/2019/02/25/paper-parsing-gigabytes-of-json-per-second/.
Some people [enjoy reading our paper](https://arxiv.org/abs/1902.08318):
<https://twitter.com/halvarflake/status/1118459536686362625>
## Performance results
simdjson uses three-quarters fewer instructions than the state-of-the-art parser RapidJSON and fifty percent fewer than sajson. To our knowledge, simdjson is the first fully-validating JSON parser to run at gigabytes per second on commodity processors.
On a Skylake processor, the parsing speeds (in GB/s) of various parsers on the twitter.json file are as follows.
| parser | GB/s |
| ------------------------------------- | ---- |
| simdjson | 2.2 |
| RapidJSON encoding-validation | 0.51 |
| RapidJSON encoding-validation, insitu | 0.71 |
| sajson (insitu, dynamic) | 0.70 |
| sajson (insitu, static) | 0.97 |
| dropbox | 0.14 |
| fastjson | 0.26 |
| gason | 0.85 |
| ultrajson | 0.42 |
| jsmn | 0.28 |
| cJSON | 0.34 |
| JSON for Modern C++ (nlohmann/json) | 0.10 |
## Requirements
- We support platforms like Linux or macOS, as well as Windows through Visual Studio 2017 or later.
- A processor with
- AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD processors starting with the Zen microarchitecture released 2017),
- or SSE 4.2 and CLMUL (i.e., Intel processors going back to Westmere released in 2010 or AMD processors starting with the Jaguar used in the PS4 and XBox One)
- or a 64-bit ARM processor (ARMv8-A): this covers a wide range of mobile processors, including all Apple processors currently available for sale, going as far back as the iPhone 5s (2013).
- A recent C++ compiler supporting C++17: GNU GCC 7 or better, LLVM clang 6 or better, or Visual Studio 2017 or better.
- Some benchmark scripts assume bash and other common utilities, but they are optional.
## License
This code is made available under the Apache License 2.0.
Under Windows, we build some tools using the windows/dirent_portable.h file (which is outside our library code): it is under the liberal (business-friendly) MIT license.
## Code usage and example
The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths, for example. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to a pre-allocated `ParsedJson` object (which can be reused multiple times). Once you have populated the `ParsedJson` object you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::Iterator pjh(pj)`, see 'Navigating the parsed document').
```C
#include "simdjson/jsonparser.h"
using namespace simdjson;
/...
const char * filename = ... //
// use whatever means you want to get a string (UTF-8) of your JSON document
padded_string p = get_corpus(filename);
ParsedJson pj;
pj.allocate_capacity(p.size()); // allocate memory for parsing up to p.size() bytes
const int res = json_parse(p, pj); // do the parsing, return 0 on success
// parsing is done!
if (res != 0) {
// You can use the "simdjson/simdjson.h" header to access the error message
std::cout << "Error parsing:" << simdjson::error_message(res) << std::endl;
}
// the ParsedJson document can be used here
// pj can be reused with other json_parse calls.
```
It is also possible to use a simpler API if you do not mind having the overhead
of memory allocation with each new JSON document:
```C
#include "simdjson/jsonparser.h"
using namespace simdjson;
/...
const char * filename = ... //
padded_string p = get_corpus(filename);
ParsedJson pj = build_parsed_json(p); // do the parsing
if( ! pj.is_valid() ) {
// something went wrong
std::cout << pj.get_error_message() << std::endl;
}
```
Though the `padded_string` class is recommended for best performance, you can call `json_parse` and `build_parsed_json`, passing a standard `std::string` object.
```C
#include "simdjson/jsonparser.h"
using namespace simdjson;
/...
std::string mystring = ... //
ParsedJson pj;
pj.allocate_capacity(mystring.size()); // allocate memory for parsing up to mystring.size() bytes
// std::string may not overallocate so a copy will be needed
const int res = json_parse(mystring, pj); // do the parsing, return 0 on success
// parsing is done!
if (res != 0) {
// You can use the "simdjson/simdjson.h" header to access the error message
std::cout << "Error parsing:" << simdjson::error_message(res) << std::endl;
}
// pj can be reused with other json_parse calls.
```
or
```C
#include "simdjson/jsonparser.h"
using namespace simdjson;
/...
std::string mystring = ... //
// std::string may not overallocate so a copy will be needed
ParsedJson pj = build_parsed_json(mystring); // do the parsing
if( ! pj.is_valid() ) {
// something went wrong
std::cout << pj.get_error_message() << std::endl;
}
```
As needed, the `json_parse` and `build_parsed_json` functions copy the input data to a temporary buffer readable up to SIMDJSON_PADDING bytes beyond the end of the data.
## Usage: easy single-header version
See the "singleheader" repository for a single header version. See the included
file "amalgamation_demo.cpp" for usage. This requires no specific build system: just
copy the files in your project in your include path. You can then include them quite simply:
```C
#include <iostream>
#include "simdjson.h"
#include "simdjson.cpp"
using namespace simdjson;
int main(int argc, char *argv[]) {
const char * filename = argv[1];
padded_string p = get_corpus(filename);
ParsedJson pj = build_parsed_json(p); // do the parsing
if( ! pj.is_valid() ) {
std::cout << "not valid" << std::endl;
std::cout << pj.get_error_message() << std::endl;
} else {
std::cout << "valid" << std::endl;
}
return EXIT_SUCCESS;
}
```
Note: In some settings, it might be desirable to precompile `simdjson.cpp` instead of including it.
## Runtime dispatch
On Intel and AMD processors, we get best performance by using the hardware support for AVX2 instructions. However, simdjson also
runs on older Intel and AMD processors. We require a minimum feature support of SSE 4.2 and CLMUL (2010 Intel Westmere or better).
The code automatically detects the feature set of your processor and switches to the right function at runtime (a technique
sometimes called runtime dispatch).
We also support 64-bit ARM. We assume NEON support and, if the cryptographic extension is available, we leverage it at compile-time.
There is no runtime dispatch on ARM.
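The snippet below is a generic illustration of the runtime-dispatch technique, not simdjson's internal code: a function pointer is resolved once, based on CPU features detected with the GCC/Clang `__builtin_cpu_supports` builtin. The two kernels here are stand-ins; a real library would compile AVX2 and SSE 4.2 variants of the parser.

```C
#include <cstddef>

static int kernel_avx2(const char *buf, size_t len) { (void)buf; return (int)len; }
static int kernel_sse42(const char *buf, size_t len) { (void)buf; return (int)len; }

using kernel_fn = int (*)(const char *, size_t);

// Query the CPU once and return the best available kernel.
static kernel_fn pick_kernel() {
  __builtin_cpu_init();
  if (__builtin_cpu_supports("avx2")) {
    return kernel_avx2;
  }
  return kernel_sse42;
}

static kernel_fn active_kernel = nullptr;

// The pointer is resolved lazily on first use, which is one reason to make the first
// parsing call from a single-threaded context (see the Thread safety section below).
int dispatch_parse(const char *buf, size_t len) {
  if (active_kernel == nullptr) {
    active_kernel = pick_kernel();
  }
  return active_kernel(buf, len);
}
```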
## Thread safety
The simdjson library is single-threaded and thread safety is the responsibility of the caller. If you are on an x64 processor, the runtime dispatching assigns the right code path the first time that parsing is attempted. For safety, you should always call json_parse at least once in a single-threaded context.
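A hedged illustration of that recommendation, following the usage shown earlier in this README (the exact overloads should be checked against the headers): parse once on the main thread so the dispatch decision is made in a single-threaded context, then give each worker thread its own `ParsedJson`.

```C
#include "simdjson/jsonparser.h"
#include <thread>
#include <vector>

using namespace simdjson;

void parse_documents(const std::vector<padded_string> &docs) {
  if (docs.empty()) {
    return;
  }
  ParsedJson warmup;
  warmup.allocate_capacity(docs[0].size());
  json_parse(docs[0], warmup); // first call happens on a single thread

  std::vector<std::thread> workers;
  for (const padded_string &doc : docs) {
    workers.emplace_back([&doc] {
      ParsedJson pj;                    // one ParsedJson per thread, never shared
      pj.allocate_capacity(doc.size());
      json_parse(doc, pj);
    });
  }
  for (std::thread &w : workers) {
    w.join();
  }
}
```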
## Usage (old-school Makefile on platforms like Linux or macOS)
Requirements: recent clang or gcc, and make. We recommend at least GNU GCC/G++ 7 or LLVM clang 6. A system like Linux or macOS is expected.
To test:
```
make
make test
```
To run benchmarks:
```
make parse
./parse jsonexamples/twitter.json
```
Under Linux, the `parse` command gives a detailed analysis of the performance counters.
To run comparative benchmarks (with other parsers):
```
make benchmark
```
## Usage (CMake on platforms like Linux or macOS)
Requirements: We require a recent version of cmake. On macOS, the easiest way to install cmake might be to use [brew](https://brew.sh) and then type
```
brew install cmake
```
There is an [equivalent brew on Linux which works the same way as well](https://linuxbrew.sh).
You need a recent compiler like clang or gcc. We recommend at least GNU GCC/G++ 7 or LLVM clang 6. For example, you can install a recent compiler with brew:
```
brew install gcc@8
```
Optional: You need to tell cmake which compiler you wish to use by setting the CC and CXX variables. Under bash, you can do so with commands such as `export CC=gcc-7` and `export CXX=g++-7`.
Building: While in the project repository, do the following:
```
mkdir build
cd build
cmake ..
make
make test
```
CMake will build a library. By default, it builds a shared library (e.g., libsimdjson.so on Linux).
You can build a static library:
```
mkdir buildstatic
cd buildstatic
cmake -DSIMDJSON_BUILD_STATIC=ON ..
make
make test
```
In some cases, you may want to specify your compiler, especially if the default compiler on your system is too old. You may proceed as follows:
```
brew install gcc@8
mkdir build
cd build
export CXX=g++-8 CC=gcc-8
cmake ..
make
make test
```
## Usage (CMake on Windows using Visual Studio)
We assume you have a common Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later) or SSE 4.2 + CLMUL (2010 Westmere or later).
- Grab the simdjson code from GitHub, e.g., by cloning it using [GitHub Desktop](https://desktop.github.com/).
- Install [CMake](https://cmake.org/download/). When you install it, make sure to ask that `cmake` be made available from the command line. Please choose a recent version of cmake.
- Create a subdirectory within simdjson, such as `VisualStudio`.
- Using a shell, go to this newly created directory.
- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `VisualStudio` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.)
- This last command (`cmake ...`) created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`.
## Usage (Using `vcpkg` on Windows, Linux and MacOS)
[vcpkg](https://github.com/Microsoft/vcpkg) users on Windows, Linux and MacOS can download and install `simdjson` with one single command from their favorite shell.
On Linux and MacOS:
```
$ ./vcpkg install simdjson
```
will build and install `simdjson` as a static library.
On Windows (64-bit):
```
.\vcpkg.exe install simdjson:x64-windows
```
will build and install `simdjson` as a shared library.
```
.\vcpkg.exe install simdjson:x64-windows-static
```
will build and install `simdjson` as a static library.
These commands will also print out instructions on how to use the library from MSBuild or CMake-based projects.
If you find the version of `simdjson` shipped with `vcpkg` is out-of-date, feel free to report it to the `vcpkg` community either by submitting an issue or by creating a PR.
## Tools
- `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output.
- `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file `tape.md`.
- `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space characters.
- `jsonpointer mydoc.json ... ` parses the document, constructs a model and then processes a series of [JSON Pointer paths](https://tools.ietf.org/html/rfc6901). The result is itself a JSON document.
## Scope
We provide a fast parser, that fully validates an input according to various specifications.
The parser builds a useful immutable (read-only) DOM (document-object model) which can be later accessed.
To simplify the engineering, we make some assumptions.
- We support UTF-8 (and thus ASCII), nothing else (no Latin, no UTF-16). We do not believe this is a genuine limitation, because we do not think there is any serious application that needs to process JSON data without an ASCII or UTF-8 encoding. If the UTF-8 contains a leading BOM, it should be omitted: the user is responsible for detecting and skipping the BOM; UTF-8 BOMs are discouraged.
- All strings in the JSON document may have up to 4294967295 bytes in UTF-8 (4GB). To enforce this constraint, we refuse to parse a document that contains more than 4294967295 bytes (4GB). This should accommodate most JSON documents.
- As allowed by the specification, we allow repeated keys within an object (other parsers like sajson do the same).
- Performance is optimized for JSON documents spanning from tens of kilobytes to many megabytes: the performance issues with having to parse many tiny JSON documents or one truly enormous JSON document are different.
_We do not aim to provide a general-purpose JSON library._ A library like RapidJSON offers much more than just parsing, it helps you generate JSON and offers various other convenient functions. We merely parse the document.
## Features
- The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document.
- We support the full range of 64-bit floating-point numbers (binary64). The values range from `std::numeric_limits<double>::lowest()` to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7976e308. Extreme values (less or equal to -1e308, greater or equal to 1e308) are rejected: we refuse to parse the input document.
- We test for accurate float parsing with a bound on the [unit of least precision (ULP)](https://en.wikipedia.org/wiki/Unit_in_the_last_place) of one. Practically speaking, this implies 15 digits of accuracy or better.
- We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation. The sajson parser does incomplete UTF-8 validation, accepting code point
sequences like 0xb1 0x87.)
- We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.)
- We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.)
- We fully validate the white-space characters outside of the strings. Parsers like RapidJSON will accept JSON documents with null characters outside of strings.
## Architecture
The parser works in two stages:
- Stage 1. (Find marks) Identifies quickly structure elements, strings, and so forth. We validate UTF-8 encoding at that stage.
- Stage 2. (Structure building) Involves constructing a "tree" of sort (materialized as a tape) to navigate through the data. Strings and numbers are parsed at this stage.
## JSON Pointer
We can navigate the parsed JSON using JSON Pointers as per the [RFC6901 standard](https://tools.ietf.org/html/rfc6901).
You can build a tool (jsonpointer) to parse a JSON document and then issue an array of JSON Pointer queries:
```
make jsonpointer
./jsonpointer jsonexamples/small/demo.json /Image/Width /Image/Height /Image/IDs/2
./jsonpointer jsonexamples/twitter.json /statuses/0/id /statuses/1/id /statuses/2/id /statuses/3/id /statuses/4/id /statuses/5/id
```
In C++, given a `ParsedJson`, we can move to a node with the `move_to` method, passing a `std::string` representing the JSON Pointer query.
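For example, a sketch along these lines (the README states that `move_to` takes a `std::string` JSON Pointer; the exact signature and return type should be checked against the headers, and `/Image/Width` refers to the bundled `jsonexamples/small/demo.json`):

```C
#include "simdjson/jsonparser.h"
#include <iostream>
#include <string>

void print_image_width(simdjson::ParsedJson &pj) {
  simdjson::ParsedJson::Iterator it(pj);
  if (it.move_to(std::string("/Image/Width"))) { // assumed to return true on success
    it.print(std::cout);                         // print the value at that node
    std::cout << std::endl;
  }
}
```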
## Navigating the parsed document
Here is a code sample to dump back the parsed JSON to a string:
```c
ParsedJson::Iterator pjh(pj);
if (!pjh.is_ok()) {
std::cerr << " Could not iterate parsed result. " << std::endl;
return EXIT_FAILURE;
}
compute_dump(pjh);
//
// where compute_dump is:
void compute_dump(ParsedJson::Iterator &pjh) {
if (pjh.is_object()) {
std::cout << "{";
if (pjh.down()) {
pjh.print(std::cout); // must be a string
std::cout << ":";
pjh.next();
compute_dump(pjh); // let us recurse
while (pjh.next()) {
std::cout << ",";
pjh.print(std::cout);
std::cout << ":";
pjh.next();
compute_dump(pjh); // let us recurse
}
pjh.up();
}
std::cout << "}";
} else if (pjh.is_array()) {
std::cout << "[";
if (pjh.down()) {
compute_dump(pjh); // let us recurse
while (pjh.next()) {
std::cout << ",";
compute_dump(pjh); // let us recurse
}
pjh.up();
}
std::cout << "]";
} else {
pjh.print(std::cout); // just print the lone value
}
}
```
The following function will find all user.id integers:
```C
void simdjson_scan(std::vector<int64_t> &answer, ParsedJson::Iterator &i) {
while(i.move_forward()) {
if(i.get_scope_type() == '{') {
bool found_user = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0);
i.move_to_value();
if(found_user) {
if(i.is_object() && i.move_to_key("id",2)) {
if (i.is_integer()) {
answer.push_back(i.get_integer());
}
i.up();
}
}
}
}
}
```
## In-depth comparisons
If you want to see how a wide range of parsers validate a given JSON file:
```
make allparserscheckfile
./allparserscheckfile myfile.json
```
For performance comparisons:
```
make parsingcompetition
./parsingcompetition myfile.json
```
For broader comparisons:
```
make allparsingcompetition
./allparsingcompetition myfile.json
```
Both the `parsingcompetition` and `allparsingcompetition` tools take a `-t` flag which produces
a table-oriented output that can be conveniently parsed by other tools.
## Docker
One can run tests and benchmarks using Docker. This is especially convenient under Linux. Privileged access may be needed to read the performance counters.
```
git clone https://github.com/lemire/simdjson.git
cd simdjson
docker build -t simdjson .
docker run --privileged -t simdjson
```
## Other programming languages
We distinguish between "bindings" (which just wrap the C++ code) and a port to another programming language (which reimplements everything).
- [pysimdjson](https://github.com/TkTech/pysimdjson): Python bindings for the simdjson project.
- [simdjson-rs](https://github.com/Licenser/simdjson-rs): Rust port
- [simdjson-rust](https://github.com/SunDoge/simdjson-rust): Rust wrapper (bindings)
- [SimdJsonSharp](https://github.com/EgorBo/SimdJsonSharp): C# version for .NET Core (bindings and full port)
- [simdjson_nodejs](https://github.com/luizperes/simdjson_nodejs): Node.js bindings for the simdjson project.
- [simdjson_php](https://github.com/crazyxman/simdjson_php): PHP bindings for the simdjson project.
## Various References
- [Google double-conv](https://github.com/google/double-conversion/)
- [How to implement atoi using SIMD?](https://stackoverflow.com/questions/35127060/how-to-implement-atoi-using-simd)
- [Parsing JSON is a Minefield 💣](http://seriot.ch/parsing_json.php)
- https://tools.ietf.org/html/rfc7159
- The Mison implementation in rust https://github.com/pikkr/pikkr
- http://rapidjson.org/md_doc_sax.html
- https://github.com/Geal/parser_benchmarks/tree/master/json
- Gron: A command line tool that makes JSON greppable https://news.ycombinator.com/item?id=16727665
- GoogleGson https://github.com/google/gson
- Jackson https://github.com/FasterXML/jackson
- https://www.yelp.com/dataset_challenge
- RapidJSON. http://rapidjson.org/
Inspiring links:
- https://auth0.com/blog/beating-json-performance-with-protobuf/
- https://gist.github.com/shijuvar/25ad7de9505232c87034b8359543404a
- https://github.com/frankmcsherry/blog/blob/master/posts/2018-02-11.md
Validating UTF-8 takes no more than 0.7 cycles per byte:
- https://github.com/lemire/fastvalidate-utf-8 https://lemire.me/blog/2018/05/16/validating-utf-8-strings-using-as-little-as-0-7-cycles-per-byte/
## Remarks on JSON parsing
- The JSON spec defines what a JSON parser is:
> A JSON parser transforms a JSON text into another representation. A JSON parser MUST accept all texts that conform to the JSON grammar. A JSON parser MAY accept non-JSON forms or extensions. An implementation may set limits on the size of texts that it accepts. An implementation may set limits on the maximum depth of nesting. An implementation may set limits on the range and precision of numbers. An implementation may set limits on the length and character contents of strings.
* JSON is not JavaScript:
> All JSON is Javascript but NOT all Javascript is JSON. So {property:1} is invalid because property does not have double quotes around it. {'property':1} is also invalid, because it's single quoted while the only thing that can placate the JSON specification is double quoting. JSON is even fussy enough that {"property":.1} is invalid too, because you should have of course written {"property":0.1}. Also, don't even think about having comments or semicolons, you guessed it: they're invalid. (credit:https://github.com/elzr/vim-json)
* The structural characters are:
```
begin-array     = [  left square bracket
begin-object    = {  left curly bracket
end-array       = ]  right square bracket
end-object      = }  right curly bracket
name-separator  = :  colon
value-separator = ,  comma
```
### Pseudo-structural elements
A character is pseudo-structural if and only if:
1. Not enclosed in quotes, AND
2. Is a non-whitespace character, AND
3. Its preceding character is either:
(a) a structural character, OR
(b) whitespace.
This lets us flag additional characters as pseudo-structural, such as the characters 1, G, and n in the following:
> { "foo" : 1.5, "bar" : 1.5 GEOFF_IS_A_DUMMY bla bla , "baz", null }
## Academic References
- T. Mühlbauer, W. Rödiger, R. Seilbeck, A. Reiser, A. Kemper, and T. Neumann. Instant loading for main memory databases. PVLDB, 6(14):1702–1713, 2013. (SIMD-based CSV parsing)
- Mytkowicz, Todd, Madanlal Musuvathi, and Wolfram Schulte. "Data-parallel finite-state machines." ACM SIGARCH Computer Architecture News. Vol. 42. No. 1. ACM, 2014.
- Lu, Yifan, et al. "Tree structured data processing on GPUs." Cloud Computing, Data Science & Engineering-Confluence, 2017 7th International Conference on. IEEE, 2017.
- Sidhu, Reetinder. "High throughput, tree automata based XML processing using FPGAs." Field-Programmable Technology (FPT), 2013 International Conference on. IEEE, 2013.
- Dai, Zefu, Nick Ni, and Jianwen Zhu. "A 1 cycle-per-byte XML parsing accelerator." Proceedings of the 18th annual ACM/SIGDA international symposium on Field programmable gate arrays. ACM, 2010.
- Lin, Dan, et al. "Parabix: Boosting the efficiency of text processing on commodity processors." High Performance Computer Architecture (HPCA), 2012 IEEE 18th International Symposium on. IEEE, 2012. http://parabix.costar.sfu.ca/export/1783/docs/HPCA2012/final_ieee/final.pdf
- Deshmukh, V. M., and G. R. Bamnote. "An empirical evaluation of optimization parameters in XML parsing for performance enhancement." Computer, Communication and Control (IC4), 2015 International Conference on. IEEE, 2015.
- Moussalli, Roger, et al. "Efficient XML Path Filtering Using GPUs." ADMS@ VLDB. 2011.
- Jianliang, Ma, et al. "Parallel speculative dom-based XML parser." High Performance Computing and Communication & 2012 IEEE 9th International Conference on Embedded Software and Systems (HPCC-ICESS), 2012 IEEE 14th International Conference on. IEEE, 2012.
- Li, Y., Katsipoulakis, N.R., Chandramouli, B., Goldstein, J. and Kossmann, D., 2017. Mison: a fast JSON parser for data analytics. Proceedings of the VLDB Endowment, 10(10), pp.1118-1129. http://www.vldb.org/pvldb/vol10/p1118-li.pdf
- Cameron, Robert D., et al. "Parallel scanning with bitstream addition: An xml case study." European Conference on Parallel Processing. Springer, Berlin, Heidelberg, 2011.
- Cameron, Robert D., Kenneth S. Herdy, and Dan Lin. "High performance XML parsing using parallel bit stream technology." Proceedings of the 2008 conference of the center for advanced studies on collaborative research: meeting of minds. ACM, 2008.
- Shah, Bhavik, et al. "A data parallel algorithm for XML DOM parsing." International XML Database Symposium. Springer, Berlin, Heidelberg, 2009.
- Cameron, Robert D., and Dan Lin. "Architectural support for SWAR text processing with parallel bit streams: the inductive doubling principle." ACM Sigplan Notices. Vol. 44. No. 3. ACM, 2009.
- Amagasa, Toshiyuki, Mana Seino, and Hiroyuki Kitagawa. "Energy-Efficient XML Stream Processing through Element-Skipping Parsing." Database and Expert Systems Applications (DEXA), 2013 24th International Workshop on. IEEE, 2013.
- Medforth, Nigel Woodland. "icXML: Accelerating Xerces-C 3.1. 1 using the Parabix Framework." (2013).
- Zhang, Qiang Scott. Embedding Parallel Bit Stream Technology Into Expat. Diss. Simon Fraser University, 2010.
- Cameron, Robert D., et al. "Fast Regular Expression Matching with Bit-parallel Data Streams."
- Lin, Dan. Bits filter: a high-performance multiple string pattern matching algorithm for malware detection. Diss. School of Computing Science-Simon Fraser University, 2010.
- Yang, Shiyang. Validation of XML Document Based on Parallel Bit Stream Technology. Diss. Applied Sciences: School of Computing Science, 2013.
- N. Nakasato, "Implementation of a parallel tree method on a GPU", Journal of Computational Science, vol. 3, no. 3, pp. 132-141, 2012.
## Funding
The work is supported by the Natural Sciences and Engineering Research Council of Canada under grant number RGPIN-2017-03910.
[license]: LICENSE
[license img]: https://img.shields.io/badge/License-Apache%202-blue.svg
simdjson-0.2.1/amalgamation.sh 0000775 0000000 0000000 00000011423 13521632563 0016340 0 ustar 00root root 0000000 0000000 #!/bin/bash
########################################################################
# Generates an "amalgamation build" for roaring. Inspired by similar
# script used by whefs.
########################################################################
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
echo "We are about to amalgamate all simdjson files into one source file. "
echo "See https://www.sqlite.org/amalgamation.html and https://en.wikipedia.org/wiki/Single_Compilation_Unit for rationale. "
AMAL_H="simdjson.h"
AMAL_C="simdjson.cpp"
# order does not matter
ALLCFILES="
$SCRIPTPATH/src/simdjson.cpp
$SCRIPTPATH/src/jsonioutil.cpp
$SCRIPTPATH/src/jsonminifier.cpp
$SCRIPTPATH/src/jsonparser.cpp
$SCRIPTPATH/src/stage1_find_marks.cpp
$SCRIPTPATH/src/stage2_build_tape.cpp
$SCRIPTPATH/src/parsedjson.cpp
$SCRIPTPATH/src/parsedjsoniterator.cpp
"
# order matters
ALLCHEADERS="
$SCRIPTPATH/include/simdjson/simdjson_version.h
$SCRIPTPATH/include/simdjson/portability.h
$SCRIPTPATH/include/simdjson/isadetection.h
$SCRIPTPATH/include/simdjson/simdjson.h
$SCRIPTPATH/include/simdjson/common_defs.h
$SCRIPTPATH/include/simdjson/padded_string.h
$SCRIPTPATH/include/simdjson/jsoncharutils.h
$SCRIPTPATH/include/simdjson/jsonformatutils.h
$SCRIPTPATH/include/simdjson/jsonioutil.h
$SCRIPTPATH/include/simdjson/simdprune_tables.h
$SCRIPTPATH/include/simdjson/simdutf8check_haswell.h
$SCRIPTPATH/include/simdjson/simdutf8check_westmere.h
$SCRIPTPATH/include/simdjson/simdutf8check_arm64.h
$SCRIPTPATH/include/simdjson/jsonminifier.h
$SCRIPTPATH/include/simdjson/parsedjson.h
$SCRIPTPATH/include/simdjson/stage1_find_marks.h
$SCRIPTPATH/include/simdjson/stage1_find_marks_flatten.h
$SCRIPTPATH/include/simdjson/stage1_find_marks_flatten_haswell.h
$SCRIPTPATH/include/simdjson/stage1_find_marks_macros.h
$SCRIPTPATH/include/simdjson/stage1_find_marks_westmere.h
$SCRIPTPATH/include/simdjson/stage1_find_marks_haswell.h
$SCRIPTPATH/include/simdjson/stage1_find_marks_arm64.h
$SCRIPTPATH/include/simdjson/stringparsing.h
$SCRIPTPATH/include/simdjson/stringparsing_macros.h
$SCRIPTPATH/include/simdjson/stringparsing_westmere.h
$SCRIPTPATH/include/simdjson/stringparsing_haswell.h
$SCRIPTPATH/include/simdjson/stringparsing_arm64.h
$SCRIPTPATH/include/simdjson/numberparsing.h
$SCRIPTPATH/include/simdjson/stage2_build_tape.h
$SCRIPTPATH/include/simdjson/jsonparser.h
"
for i in ${ALLCHEADERS} ${ALLCFILES}; do
test -e $i && continue
echo "FATAL: source file [$i] not found."
exit 127
done
function stripinc()
{
sed -e '/# *include *"/d' -e '/# *include *<simdjson/d'
}
function dofile()
{
# emit one source file into the amalgamation, stripping its local simdjson includes
RELFILE=${1#"$SCRIPTPATH/"}
echo "/* begin file $RELFILE */"
stripinc < "$1"
echo "/* end file $RELFILE */"
}
timestamp=$(date)
echo "Creating ${AMAL_H}..."
echo "/* auto-generated on ${timestamp}. Do not edit! */" > "${AMAL_H}"
{
for h in ${ALLCHEADERS}; do
dofile $h
done
} >> "${AMAL_H}"
echo "Creating ${AMAL_C}..."
echo "/* auto-generated on ${timestamp}. Do not edit! */" > "${AMAL_C}"
{
echo "#include \"${AMAL_H}\""
echo ""
echo "/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */"
echo "#ifdef DMALLOC"
echo "#include \"dmalloc.h\""
echo "#endif"
echo ""
for h in ${ALLCFILES}; do
dofile $h
done
} >> "${AMAL_C}"
DEMOCPP="amalgamation_demo.cpp"
echo "Creating ${DEMOCPP}..."
echo "/* auto-generated on ${timestamp}. Do not edit! */" > "${DEMOCPP}"
cat <<< '
#include <cstdlib>
#include <iostream>
#include "simdjson.h"
#include "simdjson.cpp"
int main(int argc, char *argv[]) {
if(argc < 2) {
std::cerr << "Please specify a filename " << std::endl;
return EXIT_FAILURE;
}
const char * filename = argv[1];
simdjson::padded_string p = simdjson::get_corpus(filename);
simdjson::ParsedJson pj = simdjson::build_parsed_json(p); // do the parsing
if( ! pj.is_valid() ) {
std::cout << "not valid" << std::endl;
} else {
std::cout << "valid" << std::endl;
}
return EXIT_SUCCESS;
}
' >> "${DEMOCPP}"
echo "Done with all files generation. "
echo "Files have been written to directory: $PWD "
ls -la ${AMAL_C} ${AMAL_H} ${DEMOCPP}
echo "Giving final instructions:"
CPPBIN=${DEMOCPP%%.*}
echo "Try :"
echo "c++ -O3 -std=c++17 -o ${CPPBIN} ${DEMOCPP} && ./${CPPBIN} ../jsonexamples/twitter.json "
SINGLEHDR=$SCRIPTPATH/singleheader
echo "Copying files to $SCRIPTPATH/singleheader "
mkdir -p $SINGLEHDR
echo "c++ -O3 -std=c++17 -o ${CPPBIN} ${DEMOCPP} && ./${CPPBIN} ../jsonexamples/twitter.json " > $SINGLEHDR/README.md
cp ${AMAL_C} ${AMAL_H} ${DEMOCPP} $SINGLEHDR
ls $SINGLEHDR
cd $SINGLEHDR && c++ -O3 -std=c++17 -o ${CPPBIN} ${DEMOCPP} && ./${CPPBIN} ../jsonexamples/twitter.json
lowercase(){
echo "$1" | tr 'A-Z' 'a-z'
}
OS=`lowercase \`uname\``
simdjson-0.2.1/benchmark/ 0000775 0000000 0000000 00000000000 13521632563 0015300 5 ustar 00root root 0000000 0000000 simdjson-0.2.1/benchmark/CMakeLists.txt 0000664 0000000 0000000 00000000354 13521632563 0020042 0 ustar 00root root 0000000 0000000 target_include_directories(${SIMDJSON_LIB_NAME}
INTERFACE
$
$
)
add_cpp_benchmark(parse)
add_cpp_benchmark(statisticalmodel)
simdjson-0.2.1/benchmark/benchmark.h 0000664 0000000 0000000 00000035275 13521632563 0017417 0 ustar 00root root 0000000 0000000 #ifndef _BENCHMARK_H_
#define _BENCHMARK_H_
#include <float.h>  // DBL_MAX
#include <stdint.h> // uint64_t, uint32_t
#include <stdio.h>  // printf, fflush
#include <time.h>   // clock_gettime, struct timespec
#ifdef __x86_64__
const char *unitname = "cycles";
#define RDTSC_START(cycles) \
do { \
uint32_t cyc_high, cyc_low; \
__asm volatile("cpuid\n" \
"rdtsc\n" \
"mov %%edx, %0\n" \
"mov %%eax, %1" \
: "=r"(cyc_high), "=r"(cyc_low) \
: \
: /* no read only */ \
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
#define RDTSC_STOP(cycles) \
do { \
uint32_t cyc_high, cyc_low; \
__asm volatile("rdtscp\n" \
"mov %%edx, %0\n" \
"mov %%eax, %1\n" \
"cpuid" \
: "=r"(cyc_high), "=r"(cyc_low) \
: /* no read only registers */ \
: "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
#else
const char *unitname = " (clock units) ";
#define RDTSC_START(cycles) \
do { \
cycles = clock(); \
} while (0)
#define RDTSC_STOP(cycles) \
do { \
cycles = clock(); \
} while (0)
#endif
static __attribute__((noinline)) uint64_t rdtsc_overhead_func(uint64_t dummy) {
return dummy;
}
uint64_t global_rdtsc_overhead = (uint64_t)UINT64_MAX;
#define RDTSC_SET_OVERHEAD(test, repeat) \
do { \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = UINT64_MAX; \
for (int i = 0; i < repeat; i++) { \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
test; \
RDTSC_STOP(cycles_final); \
cycles_diff = (cycles_final - cycles_start); \
if (cycles_diff < min_diff) \
min_diff = cycles_diff; \
} \
global_rdtsc_overhead = min_diff; \
} while (0)
double diff(timespec start, timespec end) {
return ((end.tv_nsec + 1000000000 * end.tv_sec) -
(start.tv_nsec + 1000000000 * start.tv_sec)) /
1000000000.0;
}
/*
* Prints the best number of operations per cycle where
* test is the function call, expected is the expected result of
* test, repeat is the number of times we should repeat and size is the
* number of operations represented by test.
*/
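/*
* Usage sketch, mirroring the benchmark programs elsewhere in this
* repository (here `p` is a simdjson::padded_string, `repeat` and
* `verbose` are defined by the caller, and the empty fourth argument
* means "no per-iteration setup"):
*
* BEST_TIME("simdjson (just parse) ", simdjson_just_parse(p), false, ,
*           repeat, p.size(), verbose);
*/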
#define BEST_TIME(name, test, expected, pre, repeat, size, verbose) \
do { \
if (global_rdtsc_overhead == UINT64_MAX) { \
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
} \
if (verbose) \
printf("%-40s\t: ", name); \
else \
printf("\"%-40s\"", name); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \
double min_sumclockdiff = DBL_MAX; \
uint64_t sum_diff = 0; \
double sumclockdiff = 0; \
struct timespec time1, time2; \
for (int i = 0; i < repeat; i++) { \
pre; \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
clock_gettime(CLOCK_REALTIME, &time1); \
RDTSC_START(cycles_start); \
if (test != expected) { \
fprintf(stderr, "not expected (%d , %d )", (int)test, (int)expected); \
break; \
} \
RDTSC_STOP(cycles_final); \
clock_gettime(CLOCK_REALTIME, &time2); \
double thistiming = diff(time1, time2); \
sumclockdiff += thistiming; \
if (thistiming < min_sumclockdiff) \
min_sumclockdiff = thistiming; \
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
if (cycles_diff < min_diff) \
min_diff = cycles_diff; \
sum_diff += cycles_diff; \
} \
uint64_t S = size; \
float cycle_per_op = (min_diff) / (double)S; \
float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \
double avg_gb_per_s = \
((double)S * repeat) / ((sumclockdiff)*1000.0 * 1000.0 * 1000.0); \
double max_gb_per_s = \
((double)S) / ((min_sumclockdiff)*1000.0 * 1000.0 * 1000.0); \
if (verbose) \
printf(" %7.3f %s per input byte (best) ", cycle_per_op, unitname); \
if (verbose) \
printf(" %7.3f %s per input byte (avg) ", avg_cycle_per_op, unitname); \
if (verbose) \
printf(" %7.3f GB/s (error margin: %.3f GB/s)", max_gb_per_s, \
-avg_gb_per_s + max_gb_per_s); \
if (!verbose) \
printf(" %20.3f %20.3f %20.3f %20.3f ", cycle_per_op, \
avg_cycle_per_op - cycle_per_op, max_gb_per_s, \
-avg_gb_per_s + max_gb_per_s); \
printf("\n"); \
fflush(NULL); \
} while (0)
// like BEST_TIME, but no check
#define BEST_TIME_NOCHECK(name, test, pre, repeat, size, verbose) \
do { \
if (global_rdtsc_overhead == UINT64_MAX) { \
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
} \
if (verbose) \
printf("%-40s\t: ", name); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \
uint64_t sum_diff = 0; \
for (int i = 0; i < repeat; i++) { \
pre; \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
test; \
RDTSC_STOP(cycles_final); \
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
if (cycles_diff < min_diff) \
min_diff = cycles_diff; \
sum_diff += cycles_diff; \
} \
uint64_t S = size; \
float cycle_per_op = (min_diff) / (double)S; \
float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \
if (verbose) \
printf(" %.3f %s per input byte (best) ", cycle_per_op, unitname); \
if (verbose) \
printf(" %.3f %s per input byte (avg) ", avg_cycle_per_op, unitname); \
if (verbose) \
printf("\n"); \
if (!verbose) \
printf(" %.3f ", cycle_per_op); \
fflush(NULL); \
} while (0)
// like BEST_TIME except that we run a function to check the result
#define BEST_TIME_CHECK(test, check, pre, repeat, size, verbose) \
do { \
if (global_rdtsc_overhead == UINT64_MAX) { \
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
} \
if (verbose) \
printf("%-60s\t:\n", #test); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \
uint64_t sum_diff = 0; \
for (int i = 0; i < repeat; i++) { \
pre; \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
test; \
RDTSC_STOP(cycles_final); \
if (!check) { \
printf("error"); \
break; \
} \
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
if (cycles_diff < min_diff) \
min_diff = cycles_diff; \
sum_diff += cycles_diff; \
} \
uint64_t S = size; \
float cycle_per_op = (min_diff) / (double)S; \
float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \
if (verbose) \
printf(" %.3f cycles per operation (best) ", cycle_per_op); \
if (verbose) \
printf("\t%.3f cycles per operation (avg) ", avg_cycle_per_op); \
if (verbose) \
printf("\n"); \
if (!verbose) \
printf(" %.3f ", cycle_per_op); \
fflush(NULL); \
} while (0)
#endif
simdjson-0.2.1/benchmark/distinctuseridcompetition.cpp 0000664 0000000 0000000 00000024532 13521632563 0023322 0 ustar 00root root 0000000 0000000 #include "simdjson/jsonparser.h"
#include
#include <algorithm> // std::sort, std::unique
#include <cassert>   // assert
#include <cstring>   // memcmp, strcmp
#include <iostream>  // std::cout, std::cerr
#include <unistd.h>  // getopt
#include <vector>    // std::vector
// #define RAPIDJSON_SSE2 // bad for performance
// #define RAPIDJSON_SSE42 // bad for performance
#include "rapidjson/document.h"
#include "rapidjson/reader.h"
#include "rapidjson/stringbuffer.h"
#include "rapidjson/writer.h"
#include "sajson.h"
using namespace rapidjson;
bool equals(const char *s1, const char *s2) { return strcmp(s1, s2) == 0; }
void remove_duplicates(std::vector<int64_t> &v) {
std::sort(v.begin(), v.end());
auto last = std::unique(v.begin(), v.end());
v.erase(last, v.end());
}
void print_vec(const std::vector<int64_t> &v) {
for (auto i : v) {
std::cout << i << " ";
}
std::cout << std::endl;
}
void simdjson_scan(std::vector<int64_t> &answer,
simdjson::ParsedJson::Iterator &i) {
while (i.move_forward()) {
if (i.get_scope_type() == '{') {
bool found_user = (i.get_string_length() == 4) &&
(memcmp(i.get_string(), "user", 4) == 0);
i.move_to_value();
if (found_user) {
if (i.is_object() && i.move_to_key("id", 2)) {
if (i.is_integer()) {
answer.push_back(i.get_integer());
}
i.up();
}
}
}
}
}
__attribute__((noinline)) std::vector<int64_t>
simdjson_just_dom(simdjson::ParsedJson &pj) {
std::vector<int64_t> answer;
simdjson::ParsedJson::Iterator i(pj);
simdjson_scan(answer, i);
remove_duplicates(answer);
return answer;
}
__attribute__((noinline)) std::vector<int64_t>
simdjson_compute_stats(const simdjson::padded_string &p) {
std::vector<int64_t> answer;
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
if (!pj.is_valid()) {
return answer;
}
simdjson::ParsedJson::Iterator i(pj);
simdjson_scan(answer, i);
remove_duplicates(answer);
return answer;
}
__attribute__((noinline)) bool
simdjson_just_parse(const simdjson::padded_string &p) {
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
bool answer = !pj.is_valid();
return answer;
}
void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
using namespace sajson;
switch (node.get_type()) {
case TYPE_ARRAY: {
auto length = node.get_length();
for (size_t i = 0; i < length; ++i) {
sajson_traverse(answer, node.get_array_element(i));
}
break;
}
case TYPE_OBJECT: {
auto length = node.get_length();
// sajson has O(log n) find_object_key, but we still visit each node anyhow
// because we need to visit all values.
for (auto i = 0u; i < length; ++i) {
auto key = node.get_object_key(i); // expected: sajson::string
bool found_user =
(key.length() == 4) && (memcmp(key.data(), "user", 4) == 0);
if (found_user) { // found a user!!!
auto user_value = node.get_object_value(i); // get the value
if (user_value.get_type() ==
TYPE_OBJECT) { // the value should be an object
// now we know that we only need one value
auto user_value_length = user_value.get_length();
auto right_index =
user_value.find_object_key(sajson::string("id", 2));
if (right_index < user_value_length) {
auto v = user_value.get_object_value(right_index);
if (v.get_type() == TYPE_INTEGER) { // check that it is an integer
answer.push_back(v.get_integer_value()); // record it!
} else if (v.get_type() == TYPE_DOUBLE) {
answer.push_back((int64_t)v.get_double_value()); // record it!
}
}
}
}
sajson_traverse(answer, node.get_object_value(i));
}
break;
}
case TYPE_NULL:
case TYPE_FALSE:
case TYPE_TRUE:
case TYPE_STRING:
case TYPE_DOUBLE:
case TYPE_INTEGER:
break;
default:
assert(false && "unknown node type");
}
}
__attribute__((noinline)) std::vector<int64_t>
sasjon_just_dom(sajson::document &d) {
std::vector<int64_t> answer;
sajson_traverse(answer, d.get_root());
remove_duplicates(answer);
return answer;
}
__attribute__((noinline)) std::vector<int64_t>
sasjon_compute_stats(const simdjson::padded_string &p) {
std::vector<int64_t> answer;
char *buffer = (char *)malloc(p.size());
memcpy(buffer, p.data(), p.size());
auto d = sajson::parse(sajson::dynamic_allocation(),
sajson::mutable_string_view(p.size(), buffer));
if (!d.is_valid()) {
free(buffer);
return answer;
}
sajson_traverse(answer, d.get_root());
free(buffer);
remove_duplicates(answer);
return answer;
}
__attribute__((noinline)) bool
sasjon_just_parse(const simdjson::padded_string &p) {
char *buffer = (char *)malloc(p.size());
memcpy(buffer, p.data(), p.size());
auto d = sajson::parse(sajson::dynamic_allocation(),
sajson::mutable_string_view(p.size(), buffer));
bool answer = !d.is_valid();
free(buffer);
return answer;
}
void rapid_traverse(std::vector<int64_t> &answer, const rapidjson::Value &v) {
switch (v.GetType()) {
case kObjectType:
for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd();
++m) {
bool found_user = (m->name.GetStringLength() == 4) &&
(memcmp(m->name.GetString(), "user", 4) == 0);
if (found_user) {
const rapidjson::Value &child = m->value;
if (child.GetType() == kObjectType) {
for (Value::ConstMemberIterator k = child.MemberBegin();
k != child.MemberEnd(); ++k) {
if (equals(k->name.GetString(), "id")) {
const rapidjson::Value &val = k->value;
if (val.GetType() == kNumberType) {
answer.push_back(val.GetInt64());
}
}
}
}
}
rapid_traverse(answer, m->value);
}
break;
case kArrayType:
for (Value::ConstValueIterator i = v.Begin(); i != v.End();
++i) { // v.Size();
rapid_traverse(answer, *i);
}
break;
case kNullType:
case kFalseType:
case kTrueType:
case kStringType:
case kNumberType:
default:
break;
}
}
__attribute__((noinline)) std::vector<int64_t>
rapid_just_dom(rapidjson::Document &d) {
std::vector<int64_t> answer;
rapid_traverse(answer, d);
remove_duplicates(answer);
return answer;
}
__attribute__((noinline)) std::vector<int64_t>
rapid_compute_stats(const simdjson::padded_string &p) {
std::vector<int64_t> answer;
char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0';
rapidjson::Document d;
d.ParseInsitu(buffer);
if (d.HasParseError()) {
free(buffer);
return answer;
}
rapid_traverse(answer, d);
free(buffer);
remove_duplicates(answer);
return answer;
}
__attribute__((noinline)) bool
rapid_just_parse(const simdjson::padded_string &p) {
char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0';
rapidjson::Document d;
d.ParseInsitu(buffer);
bool answer = d.HasParseError();
free(buffer);
return answer;
}
int main(int argc, char *argv[]) {
bool verbose = false;
bool just_data = false;
int c;
while ((c = getopt(argc, argv, "vt")) != -1)
switch (c) {
case 't':
just_data = true;
break;
case 'v':
verbose = true;
break;
default:
abort();
}
if (optind >= argc) {
std::cerr
<< "Using different parsers, we compute the content statistics of "
"JSON documents."
<< std::endl;
std::cerr << "Usage: " << argv[0] << " " << std::endl;
std::cerr << "Or " << argv[0] << " -v " << std::endl;
exit(1);
}
const char *filename = argv[optind];
if (optind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
<< std::endl;
}
simdjson::padded_string p;
try {
simdjson::get_corpus(filename).swap(p);
} catch (const std::exception &e) { // caught by reference to base
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
if (verbose) {
std::cout << "Input has ";
if (p.size() > 1024 * 1024)
std::cout << p.size() / (1024 * 1024) << " MB ";
else if (p.size() > 1024)
std::cout << p.size() / 1024 << " KB ";
else
std::cout << p.size() << " B ";
std::cout << std::endl;
}
std::vector<int64_t> s1 = simdjson_compute_stats(p);
if (verbose) {
printf("simdjson: ");
print_vec(s1);
}
std::vector<int64_t> s2 = rapid_compute_stats(p);
if (verbose) {
printf("rapid: ");
print_vec(s2);
}
std::vector<int64_t> s3 = sasjon_compute_stats(p);
if (verbose) {
printf("sasjon: ");
print_vec(s3);
}
assert(s1 == s2);
assert(s1 == s3);
size_t size = s1.size();
int repeat = 500;
int volume = p.size();
if (just_data) {
printf(
"name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
}
BEST_TIME("simdjson ", simdjson_compute_stats(p).size(), size, , repeat,
volume, !just_data);
BEST_TIME("rapid ", rapid_compute_stats(p).size(), size, , repeat, volume,
!just_data);
BEST_TIME("sasjon ", sasjon_compute_stats(p).size(), size, , repeat, volume,
!just_data);
BEST_TIME("simdjson (just parse) ", simdjson_just_parse(p), false, , repeat,
volume, !just_data);
BEST_TIME("rapid (just parse) ", rapid_just_parse(p), false, , repeat,
volume, !just_data);
BEST_TIME("sasjon (just parse) ", sasjon_just_parse(p), false, , repeat,
volume, !just_data);
simdjson::ParsedJson dsimdjson = simdjson::build_parsed_json(p);
BEST_TIME("simdjson (just dom) ", simdjson_just_dom(dsimdjson).size(), size,
, repeat, volume, !just_data);
char *buffer = (char *)malloc(p.size());
memcpy(buffer, p.data(), p.size());
rapidjson::Document drapid;
drapid.ParseInsitu(buffer);
BEST_TIME("rapid (just dom) ", rapid_just_dom(drapid).size(), size, , repeat,
volume, !just_data);
memcpy(buffer, p.data(), p.size());
auto dsasjon = sajson::parse(sajson::dynamic_allocation(),
sajson::mutable_string_view(p.size(), buffer));
BEST_TIME("sasjon (just dom) ", sasjon_just_dom(dsasjon).size(), size, ,
repeat, volume, !just_data);
free(buffer);
}
simdjson-0.2.1/benchmark/linux/ 0000775 0000000 0000000 00000000000 13521632563 0016437 5 ustar 00root root 0000000 0000000 simdjson-0.2.1/benchmark/linux/linux-perf-events.h 0000664 0000000 0000000 00000005223 13521632563 0022205 0 ustar 00root root 0000000 0000000 // https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h
#pragma once
#ifdef __linux__
#include <asm/unistd.h>       // for __NR_perf_event_open
#include <linux/perf_event.h> // for perf event constants
#include <sys/ioctl.h>        // for ioctl
#include <unistd.h>           // for syscall
#include <cerrno>             // for errno
#include <cstring>            // for memset
#include <iostream>
#include <string>
#include <vector>
template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
int fd;
bool working;
perf_event_attr attribs;
int num_events;
std::vector<uint64_t> temp_result_vec;
std::vector<uint64_t> ids;
public:
explicit LinuxEvents(std::vector<int> config_vec) : fd(0), working(true) {
memset(&attribs, 0, sizeof(attribs));
attribs.type = TYPE;
attribs.size = sizeof(attribs);
attribs.disabled = 1;
attribs.exclude_kernel = 1;
attribs.exclude_hv = 1;
attribs.sample_period = 0;
attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
const int pid = 0; // the current process
const int cpu = -1; // all CPUs
const unsigned long flags = 0;
int group = -1; // no group
num_events = config_vec.size();
ids.resize(config_vec.size());
uint32_t i = 0;
for (auto config : config_vec) {
attribs.config = config;
fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags);
if (fd == -1) {
report_error("perf_event_open");
}
ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]);
if (group == -1) {
group = fd;
}
}
temp_result_vec.resize(num_events * 2 + 1);
}
~LinuxEvents() { close(fd); }
inline void start() {
if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
report_error("ioctl(PERF_EVENT_IOC_RESET)");
}
if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
}
}
inline void end(std::vector<unsigned long long> &results) {
if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
}
if (read(fd, temp_result_vec.data(), temp_result_vec.size() * 8) == -1) {
report_error("read");
}
// our actual results are in slots 1,3,5, ... of this structure
// we really should be checking our ids obtained earlier to be safe
for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
results[i / 2] = temp_result_vec[i];
}
}
private:
void report_error(const std::string &context) {
if (working)
std::cerr << (context + ": " + std::string(strerror(errno))) << std::endl;
working = false;
}
};
#endif
simdjson-0.2.1/benchmark/minifiercompetition.cpp 0000664 0000000 0000000 00000013445 13521632563 0022070 0 ustar 00root root 0000000 0000000 #include
#include <cstdio>   // printf
#include <cstring>  // memcpy
#include <iostream> // std::cout, std::cerr
#include <unistd.h> // getopt
#include "simdjson/jsonioutil.h"
#include "simdjson/jsonminifier.h"
#include "simdjson/jsonparser.h"
// #define RAPIDJSON_SSE2 // bad
// #define RAPIDJSON_SSE42 // bad
#include "rapidjson/document.h"
#include "rapidjson/reader.h" // you have to check in the submodule
#include "rapidjson/stringbuffer.h"
#include "rapidjson/writer.h"
#include "sajson.h"
using namespace simdjson;
using namespace rapidjson;
std::string rapid_stringme_insitu(char *json) {
Document d;
d.ParseInsitu(json);
if (d.HasParseError()) {
std::cerr << "problem!" << std::endl;
return ""; // should do something
}
StringBuffer buffer;
Writer<StringBuffer> writer(buffer);
d.Accept(writer);
return buffer.GetString();
}
std::string rapid_stringme(char *json) {
Document d;
d.Parse(json);
if (d.HasParseError()) {
std::cerr << "problem!" << std::endl;
return ""; // should do something
}
StringBuffer buffer;
Writer<StringBuffer> writer(buffer);
d.Accept(writer);
return buffer.GetString();
}
int main(int argc, char *argv[]) {
int c;
bool verbose = false;
bool just_data = false;
while ((c = getopt(argc, argv, "vt")) != -1)
switch (c) {
case 't':
just_data = true;
break;
case 'v':
verbose = true;
break;
default:
abort();
}
if (optind >= argc) {
std::cerr << "Usage: " << argv[0] << " " << std::endl;
exit(1);
}
const char *filename = argv[optind];
simdjson::padded_string p;
try {
simdjson::get_corpus(filename).swap(p);
} catch (const std::exception &e) { // caught by reference to base
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
if (verbose) {
std::cout << "Input has ";
if (p.size() > 1024 * 1024)
std::cout << p.size() / (1024 * 1024) << " MB ";
else if (p.size() > 1024)
std::cout << p.size() / 1024 << " KB ";
else
std::cout << p.size() << " B ";
std::cout << std::endl;
}
char *buffer = simdjson::allocate_padded_buffer(p.size() + 1);
memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0';
int repeat = 50;
int volume = p.size();
if (just_data) {
printf(
"name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
}
size_t strlength = rapid_stringme((char *)p.data()).size();
if (verbose)
std::cout << "input length is " << p.size() << " stringified length is "
<< strlength << std::endl;
BEST_TIME_NOCHECK("despacing with RapidJSON",
rapid_stringme((char *)p.data()), , repeat, volume,
!just_data);
BEST_TIME_NOCHECK(
"despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer),
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
memcpy(buffer, p.data(), p.size());
size_t outlength = simdjson::json_minify((const uint8_t *)buffer, p.size(),
(uint8_t *)buffer);
if (verbose)
std::cout << "json_minify length is " << outlength << std::endl;
uint8_t *cbuffer = (uint8_t *)buffer;
BEST_TIME("json_minify", simdjson::json_minify(cbuffer, p.size(), cbuffer),
outlength, memcpy(buffer, p.data(), p.size()), repeat, volume,
!just_data);
printf("minisize = %zu, original size = %zu (minified down to %.2f percent "
"of original) \n",
outlength, p.size(), outlength * 100.0 / p.size());
/***
* Is it worth it to minify before parsing?
***/
rapidjson::Document d;
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume,
!just_data);
char *mini_buffer = simdjson::allocate_padded_buffer(p.size() + 1);
size_t minisize = simdjson::json_minify((const uint8_t *)p.data(), p.size(),
(uint8_t *)mini_buffer);
mini_buffer[minisize] = '\0';
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(),
false, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
!just_data);
size_t ast_buffer_size = p.size() * 2;
size_t *ast_buffer = (size_t *)malloc(ast_buffer_size * sizeof(size_t));
BEST_TIME(
"sajson orig",
sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
sajson::mutable_string_view(p.size(), buffer))
.is_valid(),
true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
BEST_TIME(
"sajson despaced",
sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
sajson::mutable_string_view(minisize, buffer))
.is_valid(),
true, memcpy(buffer, mini_buffer, p.size()), repeat, volume, !just_data);
simdjson::ParsedJson pj;
bool is_alloc_ok = pj.allocate_capacity(p.size(), 1024);
if (!is_alloc_ok) {
fprintf(stderr, "failed to allocate memory\n");
return EXIT_FAILURE;
}
bool automated_reallocation = false;
BEST_TIME("simdjson orig",
simdjson::json_parse((const uint8_t *)buffer, p.size(), pj,
automated_reallocation),
true, memcpy(buffer, p.data(), p.size()), repeat, volume,
!just_data);
simdjson::ParsedJson pj2;
bool is_alloc_ok2 = pj2.allocate_capacity(p.size(), 1024);
if (!is_alloc_ok2) {
fprintf(stderr, "failed to allocate memory\n");
return EXIT_FAILURE;
}
automated_reallocation = false;
BEST_TIME("simdjson despaced",
simdjson::json_parse((const uint8_t *)buffer, minisize, pj2,
automated_reallocation),
true, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
!just_data);
free(buffer);
free(ast_buffer);
free(mini_buffer);
}
simdjson-0.2.1/benchmark/parse.cpp 0000664 0000000 0000000 00000030523 13521632563 0017121 0 ustar 00root root 0000000 0000000 #include
#include
#ifndef _MSC_VER
#include
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include