TensorFlow
local Python XLA client
@Vengineer
2018/02/04 (r1.6)
2018/07/14 (r1.9)
2018/08/03 (added JAX notes)
[TensorFlow XLA logo]
https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla
Blog (since 2007): Vengineerの戯言
 http://blogs.yahoo.co.jp/verification_engineer
SlideShare :
 https://www.slideshare.net/ssuser479fa3
Twitter (since 2009):
@Vengineer
[Diagram: internal structure of TensorFlow XLA]
Client (local_client) ⇔ Service (local_service), whose backend targets one of:
・CPU
・GPU
・Interpreter
・(Accelerator)
Internal structure of TensorFlow XLA
compiler/xla/client/local_client.cc
StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
    const XlaComputation& computation,
    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
    const ExecutableBuildOptions& options) {
  ExecutableBuildOptions updated_options = options;
  if (options.device_ordinal() == -1) {
    updated_options.set_device_ordinal(default_device_ordinal());
    VLOG(3) << "Set device ordinal to default value of: "
            << updated_options.device_ordinal();
  }
  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
                      local_service_->CompileExecutable(
                          computation, argument_layouts, updated_options));
  return WrapUnique(new LocalExecutable(std::move(executable),
                                        local_service_->mutable_backend(),
                                        updated_options));
}
LocalClient::Compile hands the computation to local_service's CompileExecutable, which compiles it on the service side and returns it as an Executable.
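As a minimal sketch of how this compile path is reached from Python, using the r1.6-era xla_client API shown later in this deck (the computation and the Execute call are illustrative, not from the slides):

from tensorflow.compiler.xla.python import xla_client

# Build a tiny computation: add two f32 scalars.
c = xla_client.ComputationBuilder('add')
c.Add(c.ConstantF32Scalar(1.0), c.ConstantF32Scalar(2.0))

# Build() returns a LocalComputation; Compile() crosses the SWIG boundary
# into LocalComputation::Compile, which lands in LocalClient::Compile above.
compiled = c.Build().Compile()
print(compiled.Execute(()))  # => 3.0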
compiler/xla/service/local_service.cc
StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
    const XlaComputation& computation,
    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
    const ExecutableBuildOptions& build_options) {
  const HloModuleProto& proto = computation.proto();
  ....
  TF_ASSIGN_OR_RETURN(
      se::StreamExecutor* executor,
      execute_backend_->stream_executor(build_options.device_ordinal()));
  return BuildExecutable(proto, std::move(module_config),
                         execute_backend_.get(), executor,
                         build_options.device_allocator());
}
CompileExecutable then generates the Executable via the Service's BuildExecutable.
compiler/xla/service/service.cc
StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
    const std::vector<const HloModuleProto*>& module_protos,
    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
    Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
    DeviceMemoryAllocator* device_allocator) {
  ....
  TF_ASSIGN_OR_RETURN(
      std::vector<std::unique_ptr<Executable>> executables,
      backend->compiler()->Compile(std::move(modules), std::move(executors),
                                   device_allocator));
  ....
  return std::move(executables);
}
In BuildExecutable, the backend's compiler runs Compile and produces the Executable. The full chain is: LocalClient::Compile → LocalService::CompileExecutable → Service::BuildExecutable → Compiler::Compile.
[Diagram: the XLA compilation pipeline]
TensorFlow Graph
→ convert to an XLA graph
→ HLO (High Level Optimizer): optimization pass 1, target-hardware-independent optimizations
→ XLA graph
→ LLO (Low Level Optimizer): optimization pass 2 and code generation, target-hardware-dependent optimizations
→ execution object for the target hardware
LLVMCompiler::Compile (r1.5~) runs these two stages:
・RunHloPasses : HLO-level optimization
・RunBackend : code generation
Python library and C++ bindings for
creating and compiling local XLA …
16 Dec 2017
https://github.com/tensorflow/tensorflow/commit/75a91cf3be635af4f6004f20f3c3cc50c37d3145#diff-08e582ec7fa24a7ddc125e1378564071
https://github.com/tensorflow/tensorflow/tree/r1.6/tensorflow/compiler/xla/python
BUILD
__init__.py
local_computation_builder.cc
local_computation_builder.h
local_computation_builder.i
numpy_bridge.cc
numpy_bridge.h
xla.i
xla_client.py
xla_client_test.py
tensorflow/compiler/xla/python
class LocalComputationTest(unittest.TestCase):
  """Base class for running an XLA Computation through the local client."""

class ComputationsWithConstantsTest(LocalComputationTest):
  """Tests focusing on Constant ops."""

class ParametersTest(LocalComputationTest):
  """Tests focusing on Parameter ops and argument-passing."""

class LocalBufferTest(LocalComputationTest):
  """Tests focusing on execution with LocalBuffers."""

class SingleOpTest(LocalComputationTest):
  """Tests for single ops."""

class EmbeddedComputationsTest(LocalComputationTest):
  """Tests for XLA graphs with embedded computations (such as maps)."""

class ErrorTest(LocalComputationTest):
xla_client_test.py
class ComputationsWithConstantsTest(LocalComputationTest):
  """Tests focusing on Constant ops."""

  def testConstantScalarSumF32(self):
    c = self._NewComputation()
    c.Add(c.ConstantF32Scalar(1.11), c.ConstantF32Scalar(3.14))
    self._ExecuteAndCompareClose(c, expected=4.25)

  def testConstantScalarSumF64(self):
    c = self._NewComputation()
    c.Add(c.ConstantF64Scalar(1.11), c.ConstantF64Scalar(3.14))
    self._ExecuteAndCompareClose(c, expected=4.25)

  def testConstantScalarSumS32(self):
    c = self._NewComputation()
    c.Add(c.ConstantS32Scalar(1), c.ConstantS32Scalar(2))
    self._ExecuteAndCompareClose(c, expected=3)
xla_client_test.py
Not
Abs
Exp
Expm1
Floor
Round
Ceil
Log
Sign
Cos
Sin
Tanh
Sqrt
Square
IsFinite
Reciprocal
Neg
Sort
xla_client.py
Eq
Ne
Ge
Gt
Lt
Le
Add
Sub
Mul
Div
Rem
Max
Min
And
Or
Xor
Pow
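A small usage sketch combining a few of the listed ops with parameters (ParameterFromNumpy is assumed from the same r1.6 xla_client module; values and output are illustrative):

import numpy as np
from tensorflow.compiler.xla.python import xla_client

c = xla_client.ComputationBuilder('ops_demo')
x = c.ParameterFromNumpy(np.array([1.0, -2.0, 3.0], dtype=np.float32))
y = c.ParameterFromNumpy(np.array([4.0, 5.0, -6.0], dtype=np.float32))
c.Max(c.Abs(x), y)  # compose the unary Abs with the binary Max

args = (np.array([1.0, -2.0, 3.0], dtype=np.float32),
        np.array([4.0, 5.0, -6.0], dtype=np.float32))
compiled = c.Build().CompileWithExampleArguments(args)
print(compiled.Execute(args))  # => [4. 5. 3.]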
def _ExecuteAndCompareClose(self, c, arguments=(), expected=None):
  self._ExecuteAndAssertWith(np.testing.assert_allclose, c, arguments,
                             expected)

def _ExecuteAndAssertWith(self, assert_func, c, arguments, expected):
  assert expected is not None
  result = self._Execute(c, arguments)
  # Numpy's comparison methods are a bit too lenient by treating inputs as
  # "array-like", meaning that scalar 4 will be happily compared equal to
  # [[4]]. We'd like to be more strict so assert shapes as well.
  self.assertEqual(np.asanyarray(result).shape, np.asanyarray(expected).shape)
  assert_func(result, expected)
xla_client_test.py
from tensorflow.compiler.xla.python import xla_client

class LocalComputationTest(unittest.TestCase):
  """Base class for running an XLA Computation through the local client."""

  def _NewComputation(self, name=None):
    if name is None:
      name = self.id()
    return xla_client.ComputationBuilder(name)

  def _Execute(self, c, arguments):
    compiled_c = c.Build().CompileWithExampleArguments(arguments)
    return compiled_c.Execute(arguments)
xla_client_test.py
c_api: the Execute method of the CompiledLocalComputation class gets called
from tensorflow.compiler.xla.python import pywrap_xla as c_api
class ComputationBuilder(object):
  """XLA computation builder.

  Enqueues XLA ops in sequence and in order to build a
  LocalComputation, which in turn can be compiled into a
  CompiledLocalComputation, which in turn can be locally executed.
  """

  # The methods of this class map 1-to-1 onto the XLA C++
  # computation builder API. Therefore, there's no need to laboriously list
  # arguments and return values for every method, especially where it's
  # obvious.
  #
  # pylint: disable=g-doc-return-or-yield
  # pylint: disable=g-doc-args
xla_client.py
tf_py_wrap_cc(
    name = "pywrap_xla",
    srcs = ["xla.i"],
    swig_includes = [
        "local_computation_builder.i",
    ],
    deps = [
        ":local_computation_builder",
        ":numpy_bridge",
        "//tensorflow/compiler/xla:literal_util",
        "//tensorflow/compiler/xla:shape_util",
        "//tensorflow/compiler/xla:xla_data_proto",
        "//tensorflow/compiler/xla/service:cpu_plugin",
    ],
)
BUILD
ComputationBuilder class
def __init__(self, name):
  self._client = c_api.LocalComputationBuilder(name.encode('utf8'))
  self._parameter_numbering = itertools.count()

def Build(self):
  return LocalComputation(self._client.Build(), is_compiled=False)
Uses the c_api LocalComputationBuilder; _client.Build() builds the computation on the c_api side, and the result is wrapped in a LocalComputation.
xla_client.py
class LocalComputation(object):
  """Python wrapper for a local XLA Computation.

  A LocalComputation can be executed if it is compiled. Otherwise, it
  can still be used as a Computation where required by the
  ComputationBuilder methods.
  """

  def __init__(self, c_local_computation, is_compiled):
    self.c_local_computation = c_local_computation
    self.is_compiled = is_compiled

    # Ensure a reference to C-based destructor for use in __del__.
    if is_compiled:
      self._delete = c_api.DeleteCompiledLocalComputation
    else:
      self._delete = c_api.DeleteLocalComputation
xla_client.py
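The _delete reference is used by the destructor, which in the r1.6 source looks roughly like this (a sketch):

def __del__(self):
  self._delete(self.c_local_computation)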
LocalComputation class
def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None):
  if self.is_compiled:
    raise ValueError('Attempt to compile a compiled local XLA computation.')
  if layout_fn:
    argument_shapes = [
        shape.map_leaves(layout_fn) for shape in argument_shapes
    ]
  return LocalComputation(
      self.c_local_computation.Compile(argument_shapes, compile_options),
      is_compiled=True)
xla_client.py
def Execute(self, arguments=(), layout_fn=None):
  """Execute with Python values as arguments and return value."""
  if not self.is_compiled:
    raise ValueError('Cannot execute an uncompiled local XLA computation.')
  argument_shapes = [Shape.from_numpy(arg) for arg in arguments]
  if layout_fn:
    argument_shapes = [
        shape.map_leaves(layout_fn) for shape in argument_shapes
    ]
  else:
    argument_shapes = [None for shape in argument_shapes]
  arguments = tuple(map(require_numpy_array_layout, arguments))
  return self.c_local_computation.Execute(arguments, argument_shapes)
LocalComputation class
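Putting Compile and Execute together, a minimal usage sketch (again assuming ParameterFromNumpy; the computation 2*x + y and the values are illustrative):

import numpy as np
from tensorflow.compiler.xla.python import xla_client

c = xla_client.ComputationBuilder('axpy')
x = c.ParameterFromNumpy(np.float32(0))
y = c.ParameterFromNumpy(np.float32(0))
c.Add(c.Mul(c.ConstantF32Scalar(2.0), x), y)

# Compile with explicit argument shapes, then execute with numpy values.
shapes = [xla_client.Shape.from_numpy(np.float32(0))] * 2
compiled = c.Build().Compile(argument_shapes=shapes)
print(compiled.Execute((np.float32(3.0), np.float32(4.0))))  # => 10.0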
local_computation_builder.h
// Wraps the ComputationBuilder API in order to:
// - Support consumption by SWIG in order to be made available to
//   Python.
// - Set up the underlying builder to use the client library's
//   LocalClient.
// - Wrap Computations in LocalComputations for Python access.
// - Correspondingly unwrap incoming LocalComputations.
class LocalComputationBuilder {
 public:
  LocalComputationBuilder(const string& computation_name);

  StatusOr<LocalComputation*> Build();
c_api: LocalComputationBuilder class
local_computation_builder.cc
LocalComputationBuilder::LocalComputationBuilder(const string& computation_name)
    : builder_(GetOrCreateLocalClient(), computation_name) {}

StatusOr<LocalComputation*> LocalComputationBuilder::Build() {
  TF_ASSIGN_OR_RETURN(Computation computation, builder_.Build());
  return new LocalComputation(std::move(computation));
}
c_api: LocalComputationBuilder class
local_computation_builder.cc
StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
    const std::vector<Shape>& argument_shapes,
    const ExecutableBuildOptions* build_options) {
  std::vector<const Shape*> argument_shape_pointers;
  argument_shape_pointers.reserve(argument_shapes.size());
  for (auto& argument_shape : argument_shapes) {
    argument_shape_pointers.push_back(&argument_shape);
  }
c_api: LocalComputation class
local_computation_builder.cc
  LocalClient* client = GetOrCreateLocalClient();
  ExecutableBuildOptions options;
  if (build_options != nullptr) {
    options = *build_options;
  }
  TF_ASSIGN_OR_RETURN(
      auto local_executable,
      client->Compile(computation_, argument_shape_pointers, options));
  return new CompiledLocalComputation(std::move(local_executable));
}
c_api: LocalComputation class
tensorflow/compiler/xla/client/local_client.cc
StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
    const Computation& computation,
    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
    const ExecutableBuildOptions& options) {
  ExecutableBuildOptions updated_options = options;
  if (options.device_ordinal() == -1) {
    updated_options.set_device_ordinal(default_device_ordinal());
  }
  TF_ASSIGN_OR_RETURN(
      std::unique_ptr<Executable> executable,
      local_service_->CompileExecutable(computation.handle(), argument_layouts,
                                        updated_options));
  return WrapUnique(new LocalExecutable(std::move(executable),
                                        local_service_->mutable_backend(),
                                        updated_options));
}
LocalClient::Compile
local_computation_builder.cc
CompiledLocalComputation::CompiledLocalComputation(
    std::unique_ptr<LocalExecutable> executable)
    : executable_(std::move(executable)) {}
c_api: CompiledLocalComputation class
local_computation_builder.cc
StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
    const std::vector<Literal>& arguments,
    const std::vector<tensorflow::gtl::optional<Shape>>& shapes_with_layout) {
  // ... (omitted)
  StatusOr<std::unique_ptr<ScopedShapedBuffer>> result_buffer_status =
      executable_->Run(argument_buffers, options);
  if (!result_buffer_status.ok()) {
    results[replica] = result_buffer_status.status();
    return;
  }
  // ... (omitted)
}
c_api: CompiledLocalComputation class
tensorflow/compiler/xla/client/local_client.cc
StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
    const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
    ExecutableRunOptions run_options) {
  TF_RETURN_IF_ERROR(
      ValidateExecutionOptions(arguments, run_options, *backend_));
  Backend::StreamPtr stream;
  if (run_options.stream() == nullptr) {
    TF_ASSIGN_OR_RETURN(
        stream, BorrowStreamForDevice(run_options.device_ordinal(), backend_));
    run_options.set_stream(stream.get());
  }
  if (run_options.allocator() == nullptr) {
    run_options.set_allocator(backend_->memory_allocator());
  }
LocalExecutable::Run
tensorflow/compiler/xla/client/local_client.cc
  ServiceExecutableRunOptions service_options(
      run_options, backend_->StreamBorrower(),
      backend_->eigen_intra_op_thread_pool());
  if (executable_->dumping()) {
    return ExecuteAndDump(&service_options, arguments);
  }
  TF_ASSIGN_OR_RETURN(
      std::unique_ptr<ShapedBuffer> result,
      executable_->ExecuteOnStreamWrapper(
          &service_options, run_options.execution_profile(), arguments));
  return ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator());
}
LocalExecutable::Run
[Diagram: layering of the TensorFlow local Python XLA client]
Python: ComputationBuilder → LocalComputation → CompiledLocalComputation
C++ (via SWIG): local_client → local_service
TensorFlow local Python XLA client
Infeed & Outfeed
Setting input values and retrieving output values
def testInfeedS32Values(self):
  to_infeed = NumpyArrayS32([1, 2, 3, 4])
  c = self._NewComputation()
  c.Infeed(xla_client.Shape.from_numpy(to_infeed[0]))
  compiled_c = c.Build().CompileWithExampleArguments()
  for item in to_infeed:
    xla_client.transfer_to_infeed(item)
  for item in to_infeed:
    result = compiled_c.Execute()
    self.assertEqual(result, item)
Infeed : xla_client_test.py
def transfer_to_infeed(value, replica_number=None):
  """Transfers the given value into the XLA infeed queue.

  XLA's infeed queue is a single queue that feeds the "XLA virtual machine"
  with a totally ordered stream of values. This is dequeued from XLA
  computations via the Infeed() operation.

  Args:
    value: the value that the caller would like to enqueue into the XLA infeed
      queue
    replica_number: the replica number to infeed the value to -- if not
      provided, then the default replica (trivially replica 0) is used.
  """
  if replica_number is None:
    c_api.TransferToInfeedLocal(require_numpy_array_layout(value))
  else:
    c_api.TransferToInfeedLocalReplica(
        require_numpy_array_layout(value), replica_number)
transfer_to_infeed : xla_client.py
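A hedged usage sketch: enqueueing values into the default replica's infeed queue with plain numpy (instead of the test's NumpyArrayS32 helper):

import numpy as np
from tensorflow.compiler.xla.python import xla_client

# Enqueue three values; a running computation dequeues them via Infeed().
for v in [np.int32(1), np.int32(2), np.int32(3)]:
  xla_client.transfer_to_infeed(v)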
def testInfeedThenOutfeedS32(self):
  to_round_trip = NumpyArrayS32([1, 2, 3, 4])
  c = self._NewComputation()
  x = c.Infeed(xla_client.Shape.from_numpy(to_round_trip[0]))
  c.Outfeed(x)
  compiled_c = c.Build().CompileWithExampleArguments()
  for want in to_round_trip:
    execution = threading.Thread(target=compiled_c.Execute)
    execution.start()
    xla_client.transfer_to_infeed(want)
    got = xla_client.transfer_from_outfeed(
        xla_client.Shape.from_numpy(to_round_trip[0]))
    execution.join()
    self.assertEqual(want, got)
Outfeed : xla_client_test.py. Execute blocks until the outfeed value is consumed, which is why it runs on a separate thread.
def transfer_from_outfeed(shape, replica_number=None):
  """Transfers a literal of the given shape from replica_number's outfeed.

  Args:
    shape: The shape of the value to transfer from outfeed.
    replica_number: The replica number ordinal to transfer the outfeed value
      from. (Each replica has a distinct outfeed queue.)

  Returns:
    The literal value that is produced from the outfeed queue.
  """
  return c_api.TransferFromOutfeedLocalReplica(shape, replica_number or 0)
transfer_from_outfeed : xla_client.py
JAX (Google)
Added 2018/08/04
https://blogs.yahoo.co.jp/verification_engineer/71495058.html
The original paper is the February 2018 SysML paper:
Compiling machine learning programs via high-level tracing
JAX
 xla_builder: uses the local Python XLA client
JAX trace => HLO
Python => faster with JAX (CPU); see the sketch below
Scales on Cloud TPU?
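A minimal JAX sketch of this trace-then-compile flow (uses the public jax package; the function is the standard quickstart example, not code from the slides):

import jax
import jax.numpy as jnp

# jit traces the Python function into an XLA (HLO) computation,
# compiles it once, and caches the executable.
@jax.jit
def selu(x, alpha=1.67, lmbda=1.05):
  return lmbda * jnp.where(x > 0, x, alpha * (jnp.exp(x) - 1))

x = jnp.arange(-2.0, 3.0)
print(selu(x))  # first call traces and compiles; later calls reuse it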
Blog (since 2007): Vengineerの戯言
 http://blogs.yahoo.co.jp/verification_engineer
SlideShare :
 https://www.slideshare.net/ssuser479fa3
Twitter (since 2009):
@Vengineer
Thank you for your attention.
TensorFlow local Python XLA client