1

我一直在网上搜索,但似乎找不到简单的答案。

基本上,我有一台带有一个 GPU 的台式机,以及一台存放主要代码的笔记本电脑。我的目标是使用分布式 TensorFlow,在笔记本电脑上执行 Python 代码,同时通过网络(IP)使用台式机的 GPU。

这是我到目前为止所尝试的:

# Desktop (the machine with one GPU): run a TensorFlow server that exposes
# this machine as task 0 of the "worker" job.
import tensorflow as tf

# ClusterSpec expects a dict mapping each job name to its list of
# "host:port" task addresses (a list literal with a colon is a syntax error).
cluster = tf.train.ClusterSpec({"worker": ["192.168.1.11:2222"]})
# The server is started on construction (start=True is the default), so no
# separate server.start() call is needed.
server = tf.distribute.Server(cluster, job_name="worker", task_index=0)
# Block this process so the server keeps serving requests.
server.join()

我已经确认 192.168.1.11:2222 正在监听,但问题是我的笔记本电脑无法连接。

# for laptop connecting to the desktop
import os
import json

# Publish the cluster layout through the TF_CONFIG environment variable
# (set here, before TensorFlow is imported further down).
# NOTE(review): the "task" entry declares *this* process to be worker 0 of
# the cluster; nothing here opens a connection to the desktop's server.
# tf.config.experimental_connect_to_host may be what is needed instead -
# TODO confirm.
cluster_spec = {"worker": ["192.168.1.11:2222"]}
task_spec = {"type": "worker", "index": 0}
os.environ["TF_CONFIG"] = json.dumps({"cluster": cluster_spec, "task": task_spec})


import tensorflow as tf
import numpy as np

# Place variables and computation on the GPU of worker task 0.
strategy = tf.distribute.OneDeviceStrategy(device="/job:worker/task:0/device:GPU:0")
with strategy.scope():
    # --- Data -------------------------------------------------------------
    # Read the corpus; the context manager closes the file handle promptly.
    with open('shakespeare.txt', 'rb') as corpus_file:
        text = corpus_file.read().decode(encoding='utf-8')
    print(f'Length of text: {len(text)} characters')

    # Character vocabulary and the two lookup tables (char -> id, id -> char).
    vocab = sorted(set(text))
    print(f'{len(vocab)} unique characters')
    char2idx = {u: i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)

    text_as_int = np.array([char2idx[c] for c in text])
    tensored_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

    seq_length = 100
    batch_size = 64

    # Group the character stream into fixed-length chunks of seq_length ids.
    slicedSequences = tensored_dataset.batch(seq_length, drop_remainder=True)

    def split_input_target(chunk):
        """Split a chunk into (input, target), the target shifted by one.

        The input is every element but the last; the target is every
        element but the first, so target[i] is the element after input[i].
        """
        input_text = chunk[:-1]
        target_text = chunk[1:]
        return input_text, target_text

    slicedInputOutput = slicedSequences.map(split_input_target)
    dataset = slicedInputOutput.batch(batch_size)

    # --- Model ------------------------------------------------------------
    vocab_size = len(vocab)
    embedding_dim = 256
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None]))
    model.add(tf.keras.layers.LSTM(units=512, return_sequences=True))
    # No activation: the layer must emit raw logits, because the loss below
    # is computed with from_logits=True and tf.random.categorical in test()
    # also takes logits. A sigmoid here would make both of them wrong.
    model.add(tf.keras.layers.Dense(units=vocab_size))

    model.summary()

    def loss(labels, logits):
        """Sparse categorical cross-entropy computed on raw logits."""
        return tf.keras.losses.sparse_categorical_crossentropy(
            labels, logits, from_logits=True)

    model.compile(optimizer='adam', loss=loss)
    model.fit(dataset, epochs=1)

    # --- Sampling helper --------------------------------------------------
    # Keep the first batch around as a fixed example for test().
    inputTest, outputTest = next(iter(dataset.take(1)))

    def test():
        """Print the first example of the saved batch and sampled next chars.

        Runs the model on the saved batch, samples one character id per
        position from the logits of the first example, and prints both the
        input text and the sampled text.
        """
        example_batch_predictions = model(inputTest)
        sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
        # Drop the trailing num_samples axis, then convert to a NumPy array.
        sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
        print("Input: \n", repr("".join(idx2char[inputTest[0]])))
        print()
        print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices])))

问题是,当我定义 os.environ['TF_CONFIG'] 时,会出现以下错误:

RuntimeError: /job:worker/replica:0/task:0/device:GPU:0 unknown device.

任何帮助/建议将不胜感激。

4

0 回答 0