重复的DNA序列

187. 重复的DNA序列

DNA序列 由一系列核苷酸组成，缩写为 'A', 'C', 'G' 和 'T'.。

例如，"ACGAATTCCG" 是一个 DNA序列 。

在研究 DNA 时，识别 DNA 中的重复序列非常有用。

给定一个表示 DNA序列 的字符串 s ，返回所有在 DNA 分子中出现不止一次的 长度为 10 的序列(子字符串)。你可以按 任意顺序 返回答案。

示例 1：

输入：s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
输出：["AAAAACCCCC","CCCCCAAAAA"]

示例 2：

输入：s = "AAAAAAAAAAAAA"
输出：["AAAAAAAAAA"]

提示：

0 <= s.length <= 10⁵
s[i]=='A'、'C'、'G' or 'T'

原站题解

去查看

上次编辑到这里，代码来自缓存点击恢复默认模板

class Solution { public: vector<string> findRepeatedDnaSequences(string s) { } };

cpp 解法, 执行用时: 40 ms, 内存消耗: 15.6 MB, 提交时间: 2023-11-05 22:33:38

class Solution {
    const int L = 10;
    unordered_map<char, int> bin = {{'A', 0}, {'C', 1}, {'G', 2}, {'T', 3}};
public:
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> ans;
        int n = s.length();
        if (n <= L) {
            return ans;
        }
        int x = 0;
        for (int i = 0; i < L - 1; ++i) {
            x = (x << 2) | bin[s[i]];
        }
        unordered_map<int, int> cnt;
        for (int i = 0; i <= n - L; ++i) {
            x = ((x << 2) | bin[s[i + L - 1]]) & ((1 << (L * 2)) - 1);
            if (++cnt[x] == 2) {
                ans.push_back(s.substr(i, L));
            }
        }
        return ans;
    }
};

cpp 解法, 执行用时: 48 ms, 内存消耗: 23.1 MB, 提交时间: 2023-11-05 22:33:12

class Solution {
    const int L = 10;
public:
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> ans;
        unordered_map<string, int> cnt;
        int n = s.length();
        for (int i = 0; i <= n - L; ++i) {
            string sub = s.substr(i, L);
            if (++cnt[sub] == 2) {
                ans.push_back(sub);
            }
        }
        return ans;
    }
};

javascript 解法, 执行用时: 76 ms, 内存消耗: 48.4 MB, 提交时间: 2022-12-14 11:57:51

/**
 * @param {string} s
 * @return {string[]}
 */
var findRepeatedDnaSequences = function(s) {
    const L = 10;
    const bin = new Map();
    bin.set('A', 0);
    bin.set('C', 1);
    bin.set('G', 2);
    bin.set('T', 3);
    
    const ans = [];
    const n = s.length;
    if (n <= L) {
        return ans;
    }
    let x = 0;
    for (let i = 0; i < L - 1; ++i) {
        x = (x << 2) | bin.get(s[i]);
    }
    const cnt = new Map();
    for (let i = 0; i <= n - L; ++i) {
        x = ((x << 2) | bin.get(s[i + L - 1])) & ((1 << (L * 2)) - 1);
        cnt.set(x, (cnt.get(x) || 0) + 1);
        if (cnt.get(x) === 2) {
            ans.push(s.slice(i, i + L));
        }
    }
    return ans;
};

golang 解法, 执行用时: 16 ms, 内存消耗: 8.8 MB, 提交时间: 2022-12-14 11:57:37

const L = 10
var bin = map[byte]int{'A': 0, 'C': 1, 'G': 2, 'T': 3}

func findRepeatedDnaSequences(s string) (ans []string) {
    n := len(s)
    if n <= L {
        return
    }
    x := 0
    for _, ch := range s[:L-1] {
        x = x<<2 | bin[byte(ch)]
    }
    cnt := map[int]int{}
    for i := 0; i <= n-L; i++ {
        x = (x<<2 | bin[s[i+L-1]]) & (1<<(L*2) - 1)
        cnt[x]++
        if cnt[x] == 2 {
            ans = append(ans, s[i:i+L])
        }
    }
    return ans
}

java 解法, 执行用时: 27 ms, 内存消耗: 46.9 MB, 提交时间: 2022-12-14 11:57:23

class Solution {
    static final int L = 10;
    Map<Character, Integer> bin = new HashMap<Character, Integer>() {{
        put('A', 0);
        put('C', 1);
        put('G', 2);
        put('T', 3);
    }};

    public List<String> findRepeatedDnaSequences(String s) {
        List<String> ans = new ArrayList<String>();
        int n = s.length();
        if (n <= L) {
            return ans;
        }
        int x = 0;
        for (int i = 0; i < L - 1; ++i) {
            x = (x << 2) | bin.get(s.charAt(i));
        }
        Map<Integer, Integer> cnt = new HashMap<Integer, Integer>();
        for (int i = 0; i <= n - L; ++i) {
            x = ((x << 2) | bin.get(s.charAt(i + L - 1))) & ((1 << (L * 2)) - 1);
            cnt.put(x, cnt.getOrDefault(x, 0) + 1);
            if (cnt.get(x) == 2) {
                ans.add(s.substring(i, i + L));
            }
        }
        return ans;
    }
}

python3 解法, 执行用时: 92 ms, 内存消耗: 25.4 MB, 提交时间: 2022-12-14 11:57:10

'''
哈希表+滑动窗口+位运算
'''
L = 10
bin = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

class Solution:
    def findRepeatedDnaSequences(self, s: str) -> List[str]:
        n = len(s)
        if n <= L:
            return []
        ans = []
        x = 0
        for ch in s[:L - 1]:
            x = (x << 2) | bin[ch]
        cnt = defaultdict(int)
        for i in range(n - L + 1):
            x = ((x << 2) | bin[s[i + L - 1]]) & ((1 << (L * 2)) - 1)
            cnt[x] += 1
            if cnt[x] == 2:
                ans.append(s[i : i + L])
        return ans

javascript 解法, 执行用时: 92 ms, 内存消耗: 54.2 MB, 提交时间: 2022-12-14 11:56:37

/**
 * @param {string} s
 * @return {string[]}
 */
var findRepeatedDnaSequences = function(s) {
    const L = 10;
    const ans = [];
    const cnt = new Map();
    const n = s.length;
    for (let i = 0; i <= n - L; ++i) {
        const sub = s.slice(i, i + L)
        cnt.set(sub, (cnt.get(sub) || 0) + 1);
        if (cnt.get(sub) === 2) {
            ans.push(sub);
        }
    }
    return ans;
};

golang 解法, 执行用时: 16 ms, 内存消耗: 9.2 MB, 提交时间: 2022-12-14 11:56:25

const L = 10

func findRepeatedDnaSequences(s string) (ans []string) {
    cnt := map[string]int{}
    for i := 0; i <= len(s)-L; i++ {
        sub := s[i : i+L]
        cnt[sub]++
        if cnt[sub] == 2 {
            ans = append(ans, sub)
        }
    }
    return
}

java 解法, 执行用时: 21 ms, 内存消耗: 49.9 MB, 提交时间: 2022-12-14 11:56:14

class Solution {
    static final int L = 10;

    public List<String> findRepeatedDnaSequences(String s) {
        List<String> ans = new ArrayList<String>();
        Map<String, Integer> cnt = new HashMap<String, Integer>();
        int n = s.length();
        for (int i = 0; i <= n - L; ++i) {
            String sub = s.substring(i, i + L);
            cnt.put(sub, cnt.getOrDefault(sub, 0) + 1);
            if (cnt.get(sub) == 2) {
                ans.add(sub);
            }
        }
        return ans;
    }
}

python3 解法, 执行用时: 56 ms, 内存消耗: 27.9 MB, 提交时间: 2022-12-14 11:56:00

'''
哈希表

我们可以用一个哈希表统计 s 所有长度为 10 的子串的出现次数，返回所有出现次数超过 1 的子串。
代码实现时，可以一边遍历子串一边记录答案，为了不重复记录答案，我们只统计当前出现次数为 2 的子串。
'''
L = 10

class Solution:
    def findRepeatedDnaSequences(self, s: str) -> List[str]:
        ans = []
        cnt = defaultdict(int)
        for i in range(len(s) - L + 1):
            sub = s[i : i + L]
            cnt[sub] += 1
            if cnt[sub] == 2:
                ans.append(sub)
        return ans

上一题

下一题

详情